diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-amdpal.ll +++ /dev/null @@ -1,526 +0,0 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,CACHE_INV %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10,CACHE_INV10 %s - -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN9,SKIP_CACHE_INV %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GCN10,SKIP_CACHE_INV %s - - -; FUNC-LABEL: {{^}}system_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @system_acquire() { -entry: - fence acquire - ret void -} - -; FUNC-LABEL: {{^}}system_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN: s_endpgm -define amdgpu_kernel void @system_release() { -entry: - fence release - ret void -} - -; FUNC-LABEL: {{^}}system_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @system_acq_rel() { -entry: - fence acq_rel - ret void -} - -; FUNC-LABEL: {{^}}system_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @system_seq_cst() { -entry: - fence seq_cst - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @system_one_as_acquire() { -entry: - fence syncscope("one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN: s_endpgm -define amdgpu_kernel void @system_one_as_release() { -entry: - fence syncscope("one-as") release - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @system_one_as_acq_rel() { -entry: - fence syncscope("one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @system_one_as_seq_cst() { -entry: - fence syncscope("one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}singlethread_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_acquire() { -entry: - fence syncscope("singlethread") acquire - ret void -} - -; FUNC-LABEL: {{^}}singlethread_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_release() { -entry: - fence syncscope("singlethread") release - ret void -} - -; FUNC-LABEL: {{^}}singlethread_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_acq_rel() { -entry: - fence syncscope("singlethread") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}singlethread_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_seq_cst() { -entry: - fence syncscope("singlethread") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_one_as_acquire() { -entry: - fence syncscope("singlethread-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_one_as_release() { -entry: - fence syncscope("singlethread-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_one_as_acq_rel() { -entry: - fence syncscope("singlethread-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_one_as_seq_cst() { -entry: - fence syncscope("singlethread-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}agent_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @agent_acquire() { -entry: - fence syncscope("agent") acquire - ret void -} - -; FUNC-LABEL: {{^}}agent_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN: s_endpgm -define amdgpu_kernel void @agent_release() { -entry: - fence syncscope("agent") release - ret void -} - -; FUNC-LABEL: {{^}}agent_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @agent_acq_rel() { -entry: - fence syncscope("agent") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}agent_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @agent_seq_cst() { -entry: - fence syncscope("agent") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @agent_one_as_acquire() { -entry: - fence syncscope("agent-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN: s_endpgm -define amdgpu_kernel void @agent_one_as_release() { -entry: - fence syncscope("agent-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @agent_one_as_acq_rel() { -entry: - fence syncscope("agent-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; CACHE_INV: buffer_wbinvl1{{$}} -; CACHE_INV10: buffer_gl0_inv -; CACHE_INV10: buffer_gl1_inv -; SKIP_CACHE_INV-NOT: buffer_wbinvl1{{$}} -; SKIP_CACHE_INV-NOT: buffer_gl -; GCN: s_endpgm -define amdgpu_kernel void @agent_one_as_seq_cst() { -entry: - fence syncscope("agent-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}workgroup_acquire: -; GCN: %bb.0 -; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_acquire() { -entry: - fence syncscope("workgroup") acquire - ret void -} - -; FUNC-LABEL: {{^}}workgroup_release: -; GCN: %bb.0 -; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_release() { -entry: - fence syncscope("workgroup") release - ret void -} - -; FUNC-LABEL: {{^}}workgroup_acq_rel: -; GCN: %bb.0 -; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_acq_rel() { -entry: - fence syncscope("workgroup") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}workgroup_seq_cst: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_seq_cst() { -entry: - fence syncscope("workgroup") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_acquire: -; GCN: %bb.0 -; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt vmcnt(0) -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_one_as_acquire() { -entry: - fence syncscope("workgroup-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_release: -; GCN: %bb.0 -; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt vmcnt(0) -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_one_as_release() { -entry: - fence syncscope("workgroup-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: -; GCN: %bb.0 -; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt vmcnt(0) -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_one_as_acq_rel() { -entry: - fence syncscope("workgroup-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: -; GCN: %bb.0 -; GCN9-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN10: s_waitcnt vmcnt(0) -; GCN10: s_waitcnt_vscnt null, 0x0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_one_as_seq_cst() { -entry: - fence syncscope("workgroup-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}wavefront_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_acquire() { -entry: - fence syncscope("wavefront") acquire - ret void -} - -; FUNC-LABEL: {{^}}wavefront_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_release() { -entry: - fence syncscope("wavefront") release - ret void -} - -; FUNC-LABEL: {{^}}wavefront_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_acq_rel() { -entry: - fence syncscope("wavefront") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}wavefront_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_seq_cst() { -entry: - fence syncscope("wavefront") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_one_as_acquire() { -entry: - fence syncscope("wavefront-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_one_as_release() { -entry: - fence syncscope("wavefront-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_one_as_acq_rel() { -entry: - fence syncscope("wavefront-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_one_as_seq_cst() { -entry: - fence syncscope("wavefront-one-as") seq_cst - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-cmpxchg.ll +++ /dev/null @@ -1,3292 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s - -; GCN-LABEL: {{^}}system_one_as_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GFX10: .amdhsa_kernel system_one_as_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}system_one_as_release_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl._inv -; GFX10: .amdhsa_kernel system_one_as_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}system_one_as_release_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GFX10: .amdhsa_kernel singlethread_one_as_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GFX10: .amdhsa_kernel singlethread_one_as_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; Gfx8-NOT: buffer_wbinvl1_vol -; GCN-NOT: buffer_gl{{[01]}}_inv -; GFX10: .amdhsa_kernel singlethread_one_as_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_one_as_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_one_as_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel agent_one_as_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_release_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel agent_one_as_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_release_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel workgroup_one_as_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: buffer_gl0_inv -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10: .amdhsa_kernel workgroup_one_as_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_release_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel workgroup_one_as_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: buffer_gl0_inv -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10: .amdhsa_kernel workgroup_one_as_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_release_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acquire_monotonic_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acquire_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acquire_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acq_rel_monotonic_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acq_rel_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acq_rel_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst_monotonic_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_seq_cst_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acquire_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acquire_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acquire_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_release_acquire_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_release_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_release_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acq_rel_acquire_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acq_rel_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acq_rel_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst_acquire_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_seq_cst_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst_seq_cst_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_seq_cst_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst_seq_cst_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acquire_monotonic_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acquire_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acquire_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acq_rel_monotonic_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acq_rel_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acq_rel_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst_monotonic_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_seq_cst_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acquire_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acquire_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acquire_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_release_acquire_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_release_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_release_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acq_rel_acquire_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acq_rel_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acq_rel_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst_acquire_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_seq_cst_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst_seq_cst_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_seq_cst_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst_seq_cst_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acquire_monotonic_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: buffer_gl0_inv -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10: .amdhsa_kernel workgroup_one_as_acquire_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acquire_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_monotonic_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acq_rel_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_monotonic_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acquire_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: buffer_gl0_inv -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10: .amdhsa_kernel workgroup_one_as_acquire_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acquire_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_release_acquire_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_release_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_release_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_acquire_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acq_rel_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_acquire_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_seq_cst_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst_seq_cst_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GFX10: .amdhsa_kernel system_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}system_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}system_release_monotonic: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl._inv -; GFX10: .amdhsa_kernel system_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic - ret void -} - -; GCN-LABEL: {{^}}system_acq_rel_monotonic: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_monotonic: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}system_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire - ret void -} - -; GCN-LABEL: {{^}}system_release_acquire: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire - ret void -} - -; GCN-LABEL: {{^}}system_acq_rel_acquire: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_acquire: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_seq_cst: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}singlethread_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GFX10: .amdhsa_kernel singlethread_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GFX10: .amdhsa_kernel singlethread_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; Gfx8-NOT: buffer_wbinvl1_vol -; GCN-NOT: buffer_gl{{[01]}}_inv -; GFX10: .amdhsa_kernel singlethread_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel singlethread_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}agent_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel agent_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_release_monotonic: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel agent_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_acq_rel_monotonic: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst_monotonic: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}agent_release_acquire: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire - ret void -} - -; GCN-LABEL: {{^}}agent_acq_rel_acquire: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst_acquire: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst_seq_cst: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}workgroup_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel workgroup_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: buffer_gl0_inv -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10: .amdhsa_kernel workgroup_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_release_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel workgroup_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_acq_rel_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst_monotonic: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: buffer_gl0_inv -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10: .amdhsa_kernel workgroup_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_release_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_acq_rel_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst_acquire: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}wavefront_monotonic_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_monotonic_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_monotonic_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_acquire_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_acquire_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acquire_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_release_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_release_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_release_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_acq_rel_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_acq_rel_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acq_rel_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_seq_cst_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_seq_cst_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_seq_cst_monotonic( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_acquire_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_acquire_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acquire_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_release_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_release_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_release_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_acq_rel_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_acq_rel_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acq_rel_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_seq_cst_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_seq_cst_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_seq_cst_acquire( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_seq_cst_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}._inv -; GFX10: .amdhsa_kernel wavefront_seq_cst_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_seq_cst_seq_cst( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst - ret void -} - -; GCN-LABEL: {{^}}system_acquire_monotonic_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acquire_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acquire_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_acq_rel_monotonic_ret: -; GCN: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acq_rel_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acq_rel_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_monotonic_ret: -; GCN: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_seq_cst_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_acquire_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acquire_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acquire_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_release_acquire_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_release_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_release_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_acq_rel_acquire_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acq_rel_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acq_rel_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_acquire_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_seq_cst_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_seq_cst_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_seq_cst_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst_seq_cst_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_acquire_monotonic_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acquire_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acquire_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_acq_rel_monotonic_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acq_rel_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acq_rel_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst_monotonic_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_seq_cst_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_acquire_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acquire_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acquire_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_release_acquire_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_release_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_release_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_acq_rel_acquire_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acq_rel_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acq_rel_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst_acquire_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_seq_cst_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst_seq_cst_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_seq_cst_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst_seq_cst_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_acquire_monotonic_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: buffer_gl0_inv -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10: .amdhsa_kernel workgroup_acquire_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acquire_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_acq_rel_monotonic_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_acq_rel_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acq_rel_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst_monotonic_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_seq_cst_monotonic_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst_monotonic_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_acquire_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: buffer_gl0_inv -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10: .amdhsa_kernel workgroup_acquire_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acquire_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_release_acquire_ret: -; GFX8: s_waitcnt lgkmcnt(0){{$}} -; GFX8: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_release_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_release_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_acq_rel_acquire_ret: -; GFX8: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_acq_rel_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acq_rel_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst_acquire_ret: -; GFX8: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_seq_cst_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst_acquire_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst_seq_cst_ret: -; GFX8: s_waitcnt lgkmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{( offset:[0-9]+)*}} glc{{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_seq_cst_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst_seq_cst_ret( - i32* %out, i32 %in, i32 %old) { -entry: - %gep = getelementptr i32, i32* %out, i32 4 - %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst - %val0 = extractvalue { i32, i1 } %val, 0 - store i32 %val0, i32* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-fence.ll +++ /dev/null @@ -1,719 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX6,GFX68 %s -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX8,GFX68 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX10,GFX10CU %s - -; FUNC-LABEL: {{^}}system_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0){{$}} -; GFX6-NEXT: buffer_wbinvl1{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol{{$}} -; GFX10: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel system_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acquire() { -entry: - fence syncscope("one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel system_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_release() { -entry: - fence syncscope("one-as") release - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX6: buffer_wbinvl1{{$}} -; GFX8: buffer_wbinvl1_vol{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel system_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acq_rel() { -entry: - fence syncscope("one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX6: buffer_wbinvl1{{$}} -; GFX8: buffer_wbinvl1_vol{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel system_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst() { -entry: - fence syncscope("one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel singlethread_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acquire() { -entry: - fence syncscope("singlethread-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel singlethread_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_release() { -entry: - fence syncscope("singlethread-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acq_rel() { -entry: - fence syncscope("singlethread-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_seq_cst() { -entry: - fence syncscope("singlethread-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0){{$}} -; GFX6-NEXT: buffer_wbinvl1{{$}} -; GFX8: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol{{$}} -; GFX10: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel agent_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acquire() { -entry: - fence syncscope("agent-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel agent_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_release() { -entry: - fence syncscope("agent-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX6: buffer_wbinvl1{{$}} -; GFX8: buffer_wbinvl1_vol{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel agent_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acq_rel() { -entry: - fence syncscope("agent-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX6: buffer_wbinvl1{{$}} -; GFX8: buffer_wbinvl1_vol{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel agent_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst() { -entry: - fence syncscope("agent-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_acquire: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv{{$}} -; GFX10CU-NOT: buffer_gl0_inv{{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel workgroup_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acquire() { -entry: - fence syncscope("workgroup-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_release: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10-NOT: buffer_gl0_inv -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel workgroup_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_release() { -entry: - fence syncscope("workgroup-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: buffer_gl0_inv{{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acq_rel() { -entry: - fence syncscope("workgroup-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: buffer_gl0_inv{{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst() { -entry: - fence syncscope("workgroup-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel wavefront_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acquire() { -entry: - fence syncscope("wavefront-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel wavefront_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_release() { -entry: - fence syncscope("wavefront-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel wavefront_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acq_rel() { -entry: - fence syncscope("wavefront-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_seq_cst() { -entry: - fence syncscope("wavefront-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}system_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX6-NEXT: buffer_wbinvl1{{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol{{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel system_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acquire() { -entry: - fence acquire - ret void -} - -; FUNC-LABEL: {{^}}system_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel system_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_release() { -entry: - fence release - ret void -} - -; FUNC-LABEL: {{^}}system_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX6: buffer_wbinvl1{{$}} -; GFX8: buffer_wbinvl1_vol{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel system_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acq_rel() { -entry: - fence acq_rel - ret void -} - -; FUNC-LABEL: {{^}}system_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX6: buffer_wbinvl1{{$}} -; GFX8: buffer_wbinvl1_vol{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel system_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst() { -entry: - fence seq_cst - ret void -} - -; FUNC-LABEL: {{^}}singlethread_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel singlethread_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acquire() { -entry: - fence syncscope("singlethread") acquire - ret void -} - -; FUNC-LABEL: {{^}}singlethread_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel singlethread_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_release() { -entry: - fence syncscope("singlethread") release - ret void -} - -; FUNC-LABEL: {{^}}singlethread_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel singlethread_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acq_rel() { -entry: - fence syncscope("singlethread") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}singlethread_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel singlethread_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_seq_cst() { -entry: - fence syncscope("singlethread") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}agent_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX6-NEXT: buffer_wbinvl1{{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol{{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel agent_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acquire() { -entry: - fence syncscope("agent") acquire - ret void -} - -; FUNC-LABEL: {{^}}agent_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel agent_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_release() { -entry: - fence syncscope("agent") release - ret void -} - -; FUNC-LABEL: {{^}}agent_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX6: buffer_wbinvl1{{$}} -; GFX8: buffer_wbinvl1_vol{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel agent_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acq_rel() { -entry: - fence syncscope("agent") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}agent_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GFX6: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX6: buffer_wbinvl1{{$}} -; GFX8: buffer_wbinvl1_vol{{$}} -; GFX10-NEXT: buffer_gl0_inv{{$}} -; GFX10-NEXT: buffer_gl1_inv{{$}} -; GCN: s_endpgm -; GFX10: .amdhsa_kernel agent_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst() { -entry: - fence syncscope("agent") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}workgroup_acquire: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv{{$}} -; GFX10CU-NOT: buffer_gl0_inv{{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel workgroup_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acquire() { -entry: - fence syncscope("workgroup") acquire - ret void -} - -; FUNC-LABEL: {{^}}workgroup_release: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10-NOT: buffer_gl0_inv -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel workgroup_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_release() { -entry: - fence syncscope("workgroup") release - ret void -} - -; FUNC-LABEL: {{^}}workgroup_acq_rel: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: buffer_gl0_inv{{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel workgroup_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acq_rel() { -entry: - fence syncscope("workgroup") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}workgroup_seq_cst: -; GCN: %bb.0 -; GFX68-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10WGP-NEXT: buffer_gl0_inv{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: buffer_gl0_inv{{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel workgroup_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst() { -entry: - fence syncscope("workgroup") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}wavefront_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel wavefront_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acquire() { -entry: - fence syncscope("wavefront") acquire - ret void -} - -; FUNC-LABEL: {{^}}wavefront_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel wavefront_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_release() { -entry: - fence syncscope("wavefront") release - ret void -} - -; FUNC-LABEL: {{^}}wavefront_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel wavefront_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acq_rel() { -entry: - fence syncscope("wavefront") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}wavefront_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -; GFX10: .amdhsa_kernel wavefront_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_seq_cst() { -entry: - fence syncscope("wavefront") seq_cst - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir +++ /dev/null @@ -1,111 +0,0 @@ -# RUN: llc -march=amdgcn -run-pass=si-memory-legalizer %s -o - | FileCheck %s - ---- | - declare i32 @llvm.amdgcn.workitem.id.x() #0 - - define amdgpu_kernel void @atomic_max_i32_noret( - i32 addrspace(1)* %out, - i32 addrspace(1)* addrspace(1)* %in, - i32 addrspace(1)* %x, - i32 %y) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %idxprom = sext i32 %tid to i64 - %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i64 %idxprom - %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep - %xor = xor i32 %tid, 1 - %cmp = icmp ne i32 %xor, 0 - %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %cmp) - %2 = extractvalue { i1, i64 } %1, 0 - %3 = extractvalue { i1, i64 } %1, 1 - br i1 %2, label %atomic, label %exit - - atomic: ; preds = %0 - %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 100 - %ret = atomicrmw max i32 addrspace(1)* %gep, i32 %y seq_cst - br label %exit - - exit: ; preds = %atomic, %0 - call void @llvm.amdgcn.end.cf(i64 %3) - ret void - } - - declare { i1, i64 } @llvm.amdgcn.if(i1) - - declare void @llvm.amdgcn.end.cf(i64) - - attributes #0 = { nounwind readnone } - attributes #1 = { nounwind "target-cpu"="gfx803" } - -... ---- - -# CHECK-LABEL: name: atomic_max_i32_noret - -# CHECK-LABEL: bb.1.atomic: -# CHECK: BUFFER_ATOMIC_SMAX_ADDR64 -# CHECK-NEXT: S_WAITCNT 3952 -# CHECK-NEXT: BUFFER_WBINVL1_VOL - -name: atomic_max_i32_noret -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -liveins: - - { reg: '$sgpr0_sgpr1' } - - { reg: '$vgpr0' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false -body: | - bb.0 (%ir-block.0): - successors: %bb.1.atomic(0x40000000), %bb.2.exit(0x40000000) - liveins: $vgpr0, $sgpr0_sgpr1 - - $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 11, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) - $vgpr1 = V_ASHRREV_I32_e32 31, $vgpr0, implicit $exec - $vgpr1_vgpr2 = V_LSHL_B64 $vgpr0_vgpr1, 3, implicit $exec - $sgpr7 = S_MOV_B32 61440 - $sgpr6 = S_MOV_B32 0 - S_WAITCNT 127 - $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep) - $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec - V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec - $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - $sgpr2_sgpr3 = S_XOR_B64 $exec, killed $sgpr2_sgpr3, implicit-def dead $scc - SI_MASK_BRANCH %bb.2.exit, implicit $exec - - bb.1.atomic: - successors: %bb.2.exit(0x80000000) - liveins: $sgpr4_sgpr5_sgpr6_sgpr7:0x0000000C, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr1_vgpr2_vgpr3_vgpr4:0x00000003 - - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 15, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - dead $vgpr0 = V_MOV_B32_e32 -1, implicit $exec - dead $vgpr0 = V_MOV_B32_e32 61440, implicit $exec - $sgpr4_sgpr5 = S_MOV_B64 0 - S_WAITCNT 127 - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - S_WAITCNT 3952 - BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from %ir.gep) - - bb.2.exit: - liveins: $sgpr2_sgpr3 - - $exec = S_OR_B64 $exec, killed $sgpr2_sgpr3, implicit-def $scc - S_ENDPGM 0 - -... - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-rmw.ll +++ /dev/null @@ -1,1370 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX10CU %s - -; GCN-LABEL: {{^}}system_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel system_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire - ret void -} - -; GCN-LABEL: {{^}}system_one_as_release: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel system_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acq_rel: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel agent_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_release: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel agent_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acq_rel: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel workgroup_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_release: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel workgroup_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_one_as_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acquire_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acq_rel_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_acq_rel_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acq_rel_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_one_as_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acquire_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acq_rel_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_acq_rel_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acq_rel_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst_ret: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_one_as_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acquire_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acq_rel_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_acq_rel_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acq_rel_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel system_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in monotonic - ret void -} - -; GCN-LABEL: {{^}}system_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in acquire - ret void -} - -; GCN-LABEL: {{^}}system_release: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel system_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in release - ret void -} - -; GCN-LABEL: {{^}}system_acq_rel: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst - ret void -} - -; GCN-LABEL: {{^}}singlethread_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic - ret void -} - -; GCN-LABEL: {{^}}singlethread_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire - ret void -} - -; GCN-LABEL: {{^}}singlethread_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release - ret void -} - -; GCN-LABEL: {{^}}singlethread_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel singlethread_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst - ret void -} - -; GCN-LABEL: {{^}}agent_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel agent_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic - ret void -} - -; GCN-LABEL: {{^}}agent_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire - ret void -} - -; GCN-LABEL: {{^}}agent_release: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel agent_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release - ret void -} - -; GCN-LABEL: {{^}}agent_acq_rel: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt lgkmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst - ret void -} - -; GCN-LABEL: {{^}}workgroup_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel workgroup_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic - ret void -} - -; GCN-LABEL: {{^}}workgroup_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire - ret void -} - -; GCN-LABEL: {{^}}workgroup_release: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel workgroup_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release - ret void -} - -; GCN-LABEL: {{^}}workgroup_acq_rel: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst - ret void -} - -; GCN-LABEL: {{^}}wavefront_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_monotonic( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic - ret void -} - -; GCN-LABEL: {{^}}wavefront_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acquire( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire - ret void -} - -; GCN-LABEL: {{^}}wavefront_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_release( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release - ret void -} - -; GCN-LABEL: {{^}}wavefront_acq_rel: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_acq_rel -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acq_rel( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel - ret void -} - -; GCN-LABEL: {{^}}wavefront_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN-NOT: buffer_{{wbinvl1_vol|gl._inv}} -; GFX10: .amdhsa_kernel wavefront_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_seq_cst( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst - ret void -} - -; GCN-LABEL: {{^}}system_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acquire_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in acquire - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_acq_rel_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_acq_rel_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acq_rel_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel system_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acquire_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_acq_rel_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_acq_rel_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acq_rel_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst_ret: -; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX8-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GFX10: .amdhsa_kernel agent_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_acquire_ret: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_acquire_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acquire_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_acq_rel_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_acq_rel_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acq_rel_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel - store i32 %val, i32* %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst_ret: -; GFX8-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX8-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GFX10: .amdhsa_kernel workgroup_seq_cst_ret -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst_ret( - i32* %out, i32 %in) { -entry: - %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst - store i32 %val, i32* %out, align 4 - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fences-amdpal-mesa3d.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fences-amdpal-mesa3d.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fences-amdpal-mesa3d.ll @@ -0,0 +1,10495 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=PAL-GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=PAL-GFX7-SKIP_CACHE_INV %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=PAL-GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=PAL-GFX10-WGP-SKIP_CACHE_INV %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=PAL-GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -amdgcn-skip-cache-invalidations -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=PAL-GFX10-CU-SKIP_CACHE_INV %s +; RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=MESA-GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=MESA-GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=MESA-GFX10-CU %s + + +define amdgpu_kernel void @system_acquire() { +; PAL-GFX7-LABEL: system_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: system_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: system_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: system_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: system_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: system_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: system_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: system_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: system_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence acquire + ret void +} + +define amdgpu_kernel void @system_release() { +; PAL-GFX7-LABEL: system_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: system_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: system_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: system_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: system_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: system_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: system_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: system_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: system_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence release + ret void +} + +define amdgpu_kernel void @system_acq_rel() { +; PAL-GFX7-LABEL: system_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: system_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: system_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: system_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: system_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: system_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: system_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: system_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: system_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence acq_rel + ret void +} + +define amdgpu_kernel void @system_seq_cst() { +; PAL-GFX7-LABEL: system_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: system_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: system_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: system_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: system_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: system_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: system_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: system_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: system_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence seq_cst + ret void +} + +define amdgpu_kernel void @system_one_as_acquire() { +; PAL-GFX7-LABEL: system_one_as_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: system_one_as_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: system_one_as_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: system_one_as_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: system_one_as_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: system_one_as_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: system_one_as_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: system_one_as_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: system_one_as_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @system_one_as_release() { +; PAL-GFX7-LABEL: system_one_as_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: system_one_as_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: system_one_as_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: system_one_as_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: system_one_as_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: system_one_as_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: system_one_as_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: system_one_as_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: system_one_as_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @system_one_as_acq_rel() { +; PAL-GFX7-LABEL: system_one_as_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: system_one_as_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: system_one_as_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: system_one_as_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: system_one_as_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: system_one_as_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: system_one_as_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: system_one_as_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: system_one_as_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @system_one_as_seq_cst() { +; PAL-GFX7-LABEL: system_one_as_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: system_one_as_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: system_one_as_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: system_one_as_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: system_one_as_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: system_one_as_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: system_one_as_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: system_one_as_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: system_one_as_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @singlethread_acquire() { +; PAL-GFX7-LABEL: singlethread_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: singlethread_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: singlethread_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: singlethread_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: singlethread_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: singlethread_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: singlethread_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: singlethread_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: singlethread_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread") acquire + ret void +} + +define amdgpu_kernel void @singlethread_release() { +; PAL-GFX7-LABEL: singlethread_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: singlethread_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: singlethread_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: singlethread_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: singlethread_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: singlethread_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: singlethread_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: singlethread_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: singlethread_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread") release + ret void +} + +define amdgpu_kernel void @singlethread_acq_rel() { +; PAL-GFX7-LABEL: singlethread_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: singlethread_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: singlethread_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: singlethread_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: singlethread_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: singlethread_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: singlethread_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: singlethread_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: singlethread_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread") acq_rel + ret void +} + +define amdgpu_kernel void @singlethread_seq_cst() { +; PAL-GFX7-LABEL: singlethread_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: singlethread_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: singlethread_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: singlethread_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: singlethread_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: singlethread_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: singlethread_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: singlethread_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: singlethread_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread") seq_cst + ret void +} + +define amdgpu_kernel void @singlethread_one_as_acquire() { +; PAL-GFX7-LABEL: singlethread_one_as_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: singlethread_one_as_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: singlethread_one_as_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: singlethread_one_as_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: singlethread_one_as_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: singlethread_one_as_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: singlethread_one_as_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: singlethread_one_as_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: singlethread_one_as_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @singlethread_one_as_release() { +; PAL-GFX7-LABEL: singlethread_one_as_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: singlethread_one_as_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: singlethread_one_as_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: singlethread_one_as_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: singlethread_one_as_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: singlethread_one_as_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: singlethread_one_as_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: singlethread_one_as_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: singlethread_one_as_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @singlethread_one_as_acq_rel() { +; PAL-GFX7-LABEL: singlethread_one_as_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: singlethread_one_as_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: singlethread_one_as_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: singlethread_one_as_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: singlethread_one_as_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: singlethread_one_as_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: singlethread_one_as_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: singlethread_one_as_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: singlethread_one_as_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @singlethread_one_as_seq_cst() { +; PAL-GFX7-LABEL: singlethread_one_as_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: singlethread_one_as_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: singlethread_one_as_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: singlethread_one_as_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: singlethread_one_as_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: singlethread_one_as_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: singlethread_one_as_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: singlethread_one_as_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: singlethread_one_as_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @agent_acquire() { +; PAL-GFX7-LABEL: agent_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: agent_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: agent_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: agent_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: agent_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: agent_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: agent_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: agent_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: agent_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") acquire + ret void +} + +define amdgpu_kernel void @agent_release() { +; PAL-GFX7-LABEL: agent_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: agent_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: agent_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: agent_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: agent_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: agent_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: agent_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: agent_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: agent_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") release + ret void +} + +define amdgpu_kernel void @agent_acq_rel() { +; PAL-GFX7-LABEL: agent_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: agent_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: agent_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: agent_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: agent_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: agent_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: agent_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: agent_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: agent_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") acq_rel + ret void +} + +define amdgpu_kernel void @agent_seq_cst() { +; PAL-GFX7-LABEL: agent_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: agent_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: agent_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: agent_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: agent_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: agent_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: agent_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: agent_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: agent_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @agent_one_as_acquire() { +; PAL-GFX7-LABEL: agent_one_as_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: agent_one_as_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: agent_one_as_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: agent_one_as_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: agent_one_as_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: agent_one_as_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: agent_one_as_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: agent_one_as_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: agent_one_as_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @agent_one_as_release() { +; PAL-GFX7-LABEL: agent_one_as_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: agent_one_as_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: agent_one_as_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: agent_one_as_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: agent_one_as_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: agent_one_as_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: agent_one_as_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: agent_one_as_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: agent_one_as_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @agent_one_as_acq_rel() { +; PAL-GFX7-LABEL: agent_one_as_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: agent_one_as_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: agent_one_as_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: agent_one_as_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: agent_one_as_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: agent_one_as_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: agent_one_as_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: agent_one_as_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: agent_one_as_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @agent_one_as_seq_cst() { +; PAL-GFX7-LABEL: agent_one_as_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-NEXT: buffer_wbinvl1 +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: agent_one_as_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: agent_one_as_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: buffer_gl1_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: agent_one_as_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: agent_one_as_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-NEXT: buffer_gl0_inv +; PAL-GFX10-CU-NEXT: buffer_gl1_inv +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: agent_one_as_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: agent_one_as_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX7-NEXT: buffer_wbinvl1 +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: agent_one_as_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: buffer_gl1_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: agent_one_as_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-CU-NEXT: buffer_gl0_inv +; MESA-GFX10-CU-NEXT: buffer_gl1_inv +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @workgroup_acquire() { +; PAL-GFX7-LABEL: workgroup_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: workgroup_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: workgroup_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: workgroup_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: workgroup_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: workgroup_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: workgroup_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt lgkmcnt(0) +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: workgroup_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: workgroup_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @workgroup_release() { +; PAL-GFX7-LABEL: workgroup_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: workgroup_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: workgroup_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: workgroup_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: workgroup_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: workgroup_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: workgroup_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt lgkmcnt(0) +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: workgroup_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: workgroup_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @workgroup_acq_rel() { +; PAL-GFX7-LABEL: workgroup_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: workgroup_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: workgroup_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: workgroup_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: workgroup_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: workgroup_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: workgroup_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt lgkmcnt(0) +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: workgroup_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: workgroup_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @workgroup_seq_cst() { +; PAL-GFX7-LABEL: workgroup_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: workgroup_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: workgroup_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: workgroup_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: workgroup_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: workgroup_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_waitcnt lgkmcnt(0) +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: workgroup_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_waitcnt lgkmcnt(0) +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: workgroup_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: workgroup_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @workgroup_one_as_acquire() { +; PAL-GFX7-LABEL: workgroup_one_as_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: workgroup_one_as_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: workgroup_one_as_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: workgroup_one_as_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: workgroup_one_as_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: workgroup_one_as_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: workgroup_one_as_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: workgroup_one_as_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: workgroup_one_as_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @workgroup_one_as_release() { +; PAL-GFX7-LABEL: workgroup_one_as_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: workgroup_one_as_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: workgroup_one_as_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: workgroup_one_as_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: workgroup_one_as_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: workgroup_one_as_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: workgroup_one_as_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: workgroup_one_as_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: workgroup_one_as_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @workgroup_one_as_acq_rel() { +; PAL-GFX7-LABEL: workgroup_one_as_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: workgroup_one_as_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: workgroup_one_as_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: workgroup_one_as_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: workgroup_one_as_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: workgroup_one_as_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: workgroup_one_as_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: workgroup_one_as_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: workgroup_one_as_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @workgroup_one_as_seq_cst() { +; PAL-GFX7-LABEL: workgroup_one_as_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: workgroup_one_as_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: workgroup_one_as_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-NEXT: buffer_gl0_inv +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: workgroup_one_as_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt vmcnt(0) +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_waitcnt_vscnt null, 0x0 +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: workgroup_one_as_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: workgroup_one_as_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: workgroup_one_as_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: workgroup_one_as_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; MESA-GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; MESA-GFX10-WGP-NEXT: buffer_gl0_inv +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: workgroup_one_as_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @wavefront_acquire() { +; PAL-GFX7-LABEL: wavefront_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: wavefront_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: wavefront_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: wavefront_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: wavefront_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: wavefront_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: wavefront_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: wavefront_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: wavefront_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront") acquire + ret void +} + +define amdgpu_kernel void @wavefront_release() { +; PAL-GFX7-LABEL: wavefront_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: wavefront_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: wavefront_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: wavefront_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: wavefront_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: wavefront_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: wavefront_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: wavefront_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: wavefront_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront") release + ret void +} + +define amdgpu_kernel void @wavefront_acq_rel() { +; PAL-GFX7-LABEL: wavefront_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: wavefront_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: wavefront_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: wavefront_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: wavefront_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: wavefront_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: wavefront_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: wavefront_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: wavefront_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront") acq_rel + ret void +} + +define amdgpu_kernel void @wavefront_seq_cst() { +; PAL-GFX7-LABEL: wavefront_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: wavefront_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: wavefront_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: wavefront_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: wavefront_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: wavefront_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: wavefront_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: wavefront_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: wavefront_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront") seq_cst + ret void +} + +define amdgpu_kernel void @wavefront_one_as_acquire() { +; PAL-GFX7-LABEL: wavefront_one_as_acquire: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: wavefront_one_as_acquire: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: wavefront_one_as_acquire: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: wavefront_one_as_acquire: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: wavefront_one_as_acquire: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: wavefront_one_as_acquire: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: wavefront_one_as_acquire: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: wavefront_one_as_acquire: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: wavefront_one_as_acquire: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @wavefront_one_as_release() { +; PAL-GFX7-LABEL: wavefront_one_as_release: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: wavefront_one_as_release: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: wavefront_one_as_release: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: wavefront_one_as_release: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: wavefront_one_as_release: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: wavefront_one_as_release: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: wavefront_one_as_release: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: wavefront_one_as_release: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: wavefront_one_as_release: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @wavefront_one_as_acq_rel() { +; PAL-GFX7-LABEL: wavefront_one_as_acq_rel: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: wavefront_one_as_acq_rel: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: wavefront_one_as_acq_rel: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: wavefront_one_as_acq_rel: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: wavefront_one_as_acq_rel: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: wavefront_one_as_acq_rel: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: wavefront_one_as_acq_rel: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: wavefront_one_as_acq_rel: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: wavefront_one_as_acq_rel: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @wavefront_one_as_seq_cst() { +; PAL-GFX7-LABEL: wavefront_one_as_seq_cst: +; PAL-GFX7: ; %bb.0: ; %entry +; PAL-GFX7-NEXT: s_endpgm +; +; PAL-GFX7-SKIP_CACHE_INV-LABEL: wavefront_one_as_seq_cst: +; PAL-GFX7-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX7-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-WGP-LABEL: wavefront_one_as_seq_cst: +; PAL-GFX10-WGP: ; %bb.0: ; %entry +; PAL-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-NEXT: s_endpgm +; +; PAL-GFX10-WGP-SKIP_CACHE_INV-LABEL: wavefront_one_as_seq_cst: +; PAL-GFX10-WGP-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-WGP-SKIP_CACHE_INV-NEXT: s_endpgm +; +; PAL-GFX10-CU-LABEL: wavefront_one_as_seq_cst: +; PAL-GFX10-CU: ; %bb.0: ; %entry +; PAL-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-NEXT: s_endpgm +; +; PAL-GFX10-CU-SKIP_CACHE_INV-LABEL: wavefront_one_as_seq_cst: +; PAL-GFX10-CU-SKIP_CACHE_INV: ; %bb.0: ; %entry +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: ; implicit-def: $vcc_hi +; PAL-GFX10-CU-SKIP_CACHE_INV-NEXT: s_endpgm +; +; MESA-GFX7-LABEL: wavefront_one_as_seq_cst: +; MESA-GFX7: .amd_kernel_code_t +; MESA-GFX7-NEXT: amd_code_version_major = 1 +; MESA-GFX7-NEXT: amd_code_version_minor = 2 +; MESA-GFX7-NEXT: amd_machine_kind = 1 +; MESA-GFX7-NEXT: amd_machine_version_major = 7 +; MESA-GFX7-NEXT: amd_machine_version_minor = 0 +; MESA-GFX7-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX7-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX7-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX7-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: priority = 0 +; MESA-GFX7-NEXT: float_mode = 240 +; MESA-GFX7-NEXT: priv = 0 +; MESA-GFX7-NEXT: enable_dx10_clamp = 1 +; MESA-GFX7-NEXT: debug_mode = 0 +; MESA-GFX7-NEXT: enable_ieee_mode = 1 +; MESA-GFX7-NEXT: enable_wgp_mode = 0 +; MESA-GFX7-NEXT: enable_mem_ordered = 0 +; MESA-GFX7-NEXT: enable_fwd_progress = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX7-NEXT: user_sgpr_count = 4 +; MESA-GFX7-NEXT: enable_trap_handler = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX7-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX7-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX7-NEXT: enable_exception_msb = 0 +; MESA-GFX7-NEXT: granulated_lds_size = 0 +; MESA-GFX7-NEXT: enable_exception = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX7-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX7-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX7-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX7-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX7-NEXT: enable_wavefront_size32 = 0 +; MESA-GFX7-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX7-NEXT: private_element_size = 1 +; MESA-GFX7-NEXT: is_ptr64 = 1 +; MESA-GFX7-NEXT: is_dynamic_callstack = 0 +; MESA-GFX7-NEXT: is_debug_enabled = 0 +; MESA-GFX7-NEXT: is_xnack_enabled = 0 +; MESA-GFX7-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX7-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX7-NEXT: gds_segment_byte_size = 0 +; MESA-GFX7-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX7-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX7-NEXT: wavefront_sgpr_count = 0 +; MESA-GFX7-NEXT: workitem_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_vgpr_first = 0 +; MESA-GFX7-NEXT: reserved_vgpr_count = 0 +; MESA-GFX7-NEXT: reserved_sgpr_first = 0 +; MESA-GFX7-NEXT: reserved_sgpr_count = 0 +; MESA-GFX7-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX7-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX7-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX7-NEXT: group_segment_alignment = 4 +; MESA-GFX7-NEXT: private_segment_alignment = 4 +; MESA-GFX7-NEXT: wavefront_size = 6 +; MESA-GFX7-NEXT: call_convention = -1 +; MESA-GFX7-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX7-NEXT: .end_amd_kernel_code_t +; MESA-GFX7-NEXT: ; %bb.0: ; %entry +; MESA-GFX7-NEXT: s_endpgm +; +; MESA-GFX10-WGP-LABEL: wavefront_one_as_seq_cst: +; MESA-GFX10-WGP: .amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: amd_code_version_major = 1 +; MESA-GFX10-WGP-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-WGP-NEXT: amd_machine_kind = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-WGP-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-WGP-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-WGP-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-WGP-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-WGP-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: priority = 0 +; MESA-GFX10-WGP-NEXT: float_mode = 240 +; MESA-GFX10-WGP-NEXT: priv = 0 +; MESA-GFX10-WGP-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-WGP-NEXT: debug_mode = 0 +; MESA-GFX10-WGP-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_wgp_mode = 1 +; MESA-GFX10-WGP-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-WGP-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-WGP-NEXT: user_sgpr_count = 4 +; MESA-GFX10-WGP-NEXT: enable_trap_handler = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-WGP-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-WGP-NEXT: enable_exception_msb = 0 +; MESA-GFX10-WGP-NEXT: granulated_lds_size = 0 +; MESA-GFX10-WGP-NEXT: enable_exception = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-WGP-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-WGP-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-WGP-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-WGP-NEXT: private_element_size = 1 +; MESA-GFX10-WGP-NEXT: is_ptr64 = 1 +; MESA-GFX10-WGP-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-WGP-NEXT: is_debug_enabled = 0 +; MESA-GFX10-WGP-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-WGP-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-WGP-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-WGP-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-WGP-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-WGP-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-WGP-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-WGP-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-WGP-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: group_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: private_segment_alignment = 4 +; MESA-GFX10-WGP-NEXT: wavefront_size = 5 +; MESA-GFX10-WGP-NEXT: call_convention = -1 +; MESA-GFX10-WGP-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-WGP-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-WGP-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-WGP-NEXT: s_endpgm +; +; MESA-GFX10-CU-LABEL: wavefront_one_as_seq_cst: +; MESA-GFX10-CU: .amd_kernel_code_t +; MESA-GFX10-CU-NEXT: amd_code_version_major = 1 +; MESA-GFX10-CU-NEXT: amd_code_version_minor = 2 +; MESA-GFX10-CU-NEXT: amd_machine_kind = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_major = 10 +; MESA-GFX10-CU-NEXT: amd_machine_version_minor = 1 +; MESA-GFX10-CU-NEXT: amd_machine_version_stepping = 0 +; MESA-GFX10-CU-NEXT: kernel_code_entry_byte_offset = 256 +; MESA-GFX10-CU-NEXT: kernel_code_prefetch_byte_size = 0 +; MESA-GFX10-CU-NEXT: granulated_workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: granulated_wavefront_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: priority = 0 +; MESA-GFX10-CU-NEXT: float_mode = 240 +; MESA-GFX10-CU-NEXT: priv = 0 +; MESA-GFX10-CU-NEXT: enable_dx10_clamp = 1 +; MESA-GFX10-CU-NEXT: debug_mode = 0 +; MESA-GFX10-CU-NEXT: enable_ieee_mode = 1 +; MESA-GFX10-CU-NEXT: enable_wgp_mode = 0 +; MESA-GFX10-CU-NEXT: enable_mem_ordered = 1 +; MESA-GFX10-CU-NEXT: enable_fwd_progress = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0 +; MESA-GFX10-CU-NEXT: user_sgpr_count = 4 +; MESA-GFX10-CU-NEXT: enable_trap_handler = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_x = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_id_z = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_workgroup_info = 0 +; MESA-GFX10-CU-NEXT: enable_vgpr_workitem_id = 0 +; MESA-GFX10-CU-NEXT: enable_exception_msb = 0 +; MESA-GFX10-CU-NEXT: granulated_lds_size = 0 +; MESA-GFX10-CU-NEXT: enable_exception = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_buffer = 1 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_queue_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_kernarg_segment_ptr = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_dispatch_id = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_flat_scratch_init = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_private_segment_size = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_x = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_y = 0 +; MESA-GFX10-CU-NEXT: enable_sgpr_grid_workgroup_count_z = 0 +; MESA-GFX10-CU-NEXT: enable_wavefront_size32 = 1 +; MESA-GFX10-CU-NEXT: enable_ordered_append_gds = 0 +; MESA-GFX10-CU-NEXT: private_element_size = 1 +; MESA-GFX10-CU-NEXT: is_ptr64 = 1 +; MESA-GFX10-CU-NEXT: is_dynamic_callstack = 0 +; MESA-GFX10-CU-NEXT: is_debug_enabled = 0 +; MESA-GFX10-CU-NEXT: is_xnack_enabled = 0 +; MESA-GFX10-CU-NEXT: workitem_private_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: workgroup_group_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: gds_segment_byte_size = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_byte_size = 16 +; MESA-GFX10-CU-NEXT: workgroup_fbarrier_count = 0 +; MESA-GFX10-CU-NEXT: wavefront_sgpr_count = 2 +; MESA-GFX10-CU-NEXT: workitem_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_vgpr_count = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_first = 0 +; MESA-GFX10-CU-NEXT: reserved_sgpr_count = 0 +; MESA-GFX10-CU-NEXT: debug_wavefront_private_segment_offset_sgpr = 0 +; MESA-GFX10-CU-NEXT: debug_private_segment_buffer_sgpr = 0 +; MESA-GFX10-CU-NEXT: kernarg_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: group_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: private_segment_alignment = 4 +; MESA-GFX10-CU-NEXT: wavefront_size = 5 +; MESA-GFX10-CU-NEXT: call_convention = -1 +; MESA-GFX10-CU-NEXT: runtime_loader_kernel_symbol = 0 +; MESA-GFX10-CU-NEXT: .end_amd_kernel_code_t +; MESA-GFX10-CU-NEXT: ; %bb.0: ; %entry +; MESA-GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; MESA-GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") seq_cst + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-gfx6.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-gfx6.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-gfx6.ll @@ -0,0 +1,14589 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s + +; This is a reduced set of tests from memory-legalizer.ll that are applicable +; to GFX6, which does not support FLAT instructions. + +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @private_nontemporal_load_0( +; GFX6-LABEL: private_nontemporal_load_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX6-NEXT: s_add_u32 s8, s8, s3 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(5)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @private_nontemporal_load_1( +; GFX6-LABEL: private_nontemporal_load_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s10, -1 +; GFX6-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX6-NEXT: s_add_u32 s8, s8, s3 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(5)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid + %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_nontemporal_load_0( +; GFX6-LABEL: global_nontemporal_load_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_nontemporal_load_1( +; GFX6-LABEL: global_nontemporal_load_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s6 +; GFX6-NEXT: s_mov_b32 s1, s7 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc slc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @local_nontemporal_load_0( +; GFX6-LABEL: local_nontemporal_load_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @local_nontemporal_load_1( +; GFX6-LABEL: local_nontemporal_load_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid + %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @private_nontemporal_store_0( +; GFX6-LABEL: private_nontemporal_store_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX6-NEXT: s_add_u32 s4, s4, s3 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen glc slc +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(5)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store i32 %val, i32 addrspace(5)* %out, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @private_nontemporal_store_1( +; GFX6-LABEL: private_nontemporal_store_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX6-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX6-NEXT: s_add_u32 s4, s4, s3 +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_addc_u32 s5, s5, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen glc slc +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(5)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid + store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @global_nontemporal_store_0( +; GFX6-LABEL: global_nontemporal_store_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_mov_b32 s4, s2 +; GFX6-NEXT: s_mov_b32 s5, s3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 glc slc +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @global_nontemporal_store_1( +; GFX6-LABEL: global_nontemporal_store_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 glc slc +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @local_nontemporal_store_0( +; GFX6-LABEL: local_nontemporal_store_0: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(3)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store i32 %val, i32 addrspace(3)* %out, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @local_nontemporal_store_1( +; GFX6-LABEL: local_nontemporal_store_1: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(3)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid + store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @singlethread_acquire_fence() { +; GFX6-LABEL: singlethread_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("singlethread") acquire + ret void +} + +define amdgpu_kernel void @singlethread_release_fence() { +; GFX6-LABEL: singlethread_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("singlethread") release + ret void +} + +define amdgpu_kernel void @singlethread_acq_rel_fence() { +; GFX6-LABEL: singlethread_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("singlethread") acq_rel + ret void +} + +define amdgpu_kernel void @singlethread_seq_cst_fence() { +; GFX6-LABEL: singlethread_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("singlethread") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_unordered_load( +; GFX6-LABEL: global_singlethread_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_load( +; GFX6-LABEL: global_singlethread_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_load( +; GFX6-LABEL: global_singlethread_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_load( +; GFX6-LABEL: global_singlethread_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_unordered_store( +; GFX6-LABEL: global_singlethread_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_store( +; GFX6-LABEL: global_singlethread_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_release_store( +; GFX6-LABEL: global_singlethread_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") release, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_store( +; GFX6-LABEL: global_singlethread_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( +; GFX6-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( +; GFX6-LABEL: global_singlethread_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_release_atomicrmw( +; GFX6-LABEL: global_singlethread_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") release + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( +; GFX6-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( +; GFX6-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_unordered_load( +; GFX6-LABEL: local_singlethread_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_load( +; GFX6-LABEL: local_singlethread_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_load( +; GFX6-LABEL: local_singlethread_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_load( +; GFX6-LABEL: local_singlethread_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_unordered_store( +; GFX6-LABEL: local_singlethread_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_store( +; GFX6-LABEL: local_singlethread_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_release_store( +; GFX6-LABEL: local_singlethread_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_store( +; GFX6-LABEL: local_singlethread_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( +; GFX6-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( +; GFX6-LABEL: local_singlethread_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_release_atomicrmw( +; GFX6-LABEL: local_singlethread_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( +; GFX6-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( +; GFX6-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @singlethread_one_as_acquire_fence() { +; GFX6-LABEL: singlethread_one_as_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @singlethread_one_as_release_fence() { +; GFX6-LABEL: singlethread_one_as_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { +; GFX6-LABEL: singlethread_one_as_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { +; GFX6-LABEL: singlethread_one_as_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_unordered_load( +; GFX6-LABEL: global_singlethread_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( +; GFX6-LABEL: global_singlethread_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_load( +; GFX6-LABEL: global_singlethread_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_unordered_store( +; GFX6-LABEL: global_singlethread_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( +; GFX6-LABEL: global_singlethread_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_store( +; GFX6-LABEL: global_singlethread_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_unordered_load( +; GFX6-LABEL: local_singlethread_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( +; GFX6-LABEL: local_singlethread_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_load( +; GFX6-LABEL: local_singlethread_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_unordered_store( +; GFX6-LABEL: local_singlethread_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( +; GFX6-LABEL: local_singlethread_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_store( +; GFX6-LABEL: local_singlethread_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @wavefront_acquire_fence() { +; GFX6-LABEL: wavefront_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("wavefront") acquire + ret void +} + +define amdgpu_kernel void @wavefront_release_fence() { +; GFX6-LABEL: wavefront_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("wavefront") release + ret void +} + +define amdgpu_kernel void @wavefront_acq_rel_fence() { +; GFX6-LABEL: wavefront_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("wavefront") acq_rel + ret void +} + +define amdgpu_kernel void @wavefront_seq_cst_fence() { +; GFX6-LABEL: wavefront_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("wavefront") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_unordered_load( +; GFX6-LABEL: global_wavefront_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_load( +; GFX6-LABEL: global_wavefront_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_load( +; GFX6-LABEL: global_wavefront_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_load( +; GFX6-LABEL: global_wavefront_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_unordered_store( +; GFX6-LABEL: global_wavefront_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_store( +; GFX6-LABEL: global_wavefront_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_release_store( +; GFX6-LABEL: global_wavefront_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") release, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_store( +; GFX6-LABEL: global_wavefront_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( +; GFX6-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( +; GFX6-LABEL: global_wavefront_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_release_atomicrmw( +; GFX6-LABEL: global_wavefront_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") release + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( +; GFX6-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( +; GFX6-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_unordered_load( +; GFX6-LABEL: local_wavefront_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_load( +; GFX6-LABEL: local_wavefront_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_load( +; GFX6-LABEL: local_wavefront_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_load( +; GFX6-LABEL: local_wavefront_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_unordered_store( +; GFX6-LABEL: local_wavefront_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_store( +; GFX6-LABEL: local_wavefront_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_release_store( +; GFX6-LABEL: local_wavefront_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") release, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_store( +; GFX6-LABEL: local_wavefront_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( +; GFX6-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( +; GFX6-LABEL: local_wavefront_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_release_atomicrmw( +; GFX6-LABEL: local_wavefront_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") release + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( +; GFX6-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( +; GFX6-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @wavefront_one_as_acquire_fence() { +; GFX6-LABEL: wavefront_one_as_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @wavefront_one_as_release_fence() { +; GFX6-LABEL: wavefront_one_as_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { +; GFX6-LABEL: wavefront_one_as_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { +; GFX6-LABEL: wavefront_one_as_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_unordered_load( +; GFX6-LABEL: global_wavefront_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( +; GFX6-LABEL: global_wavefront_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_load( +; GFX6-LABEL: global_wavefront_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_unordered_store( +; GFX6-LABEL: global_wavefront_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( +; GFX6-LABEL: global_wavefront_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_store( +; GFX6-LABEL: global_wavefront_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_unordered_load( +; GFX6-LABEL: local_wavefront_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( +; GFX6-LABEL: local_wavefront_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_load( +; GFX6-LABEL: local_wavefront_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_unordered_store( +; GFX6-LABEL: local_wavefront_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( +; GFX6-LABEL: local_wavefront_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_store( +; GFX6-LABEL: local_wavefront_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @workgroup_acquire_fence() { +; GFX6-LABEL: workgroup_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @workgroup_release_fence() { +; GFX6-LABEL: workgroup_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @workgroup_acq_rel_fence() { +; GFX6-LABEL: workgroup_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @workgroup_seq_cst_fence() { +; GFX6-LABEL: workgroup_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_unordered_load( +; GFX6-LABEL: global_workgroup_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_load( +; GFX6-LABEL: global_workgroup_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_load( +; GFX6-LABEL: global_workgroup_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_load( +; GFX6-LABEL: global_workgroup_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_unordered_store( +; GFX6-LABEL: global_workgroup_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_store( +; GFX6-LABEL: global_workgroup_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_release_store( +; GFX6-LABEL: global_workgroup_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_store( +; GFX6-LABEL: global_workgroup_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( +; GFX6-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( +; GFX6-LABEL: global_workgroup_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_release_atomicrmw( +; GFX6-LABEL: global_workgroup_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( +; GFX6-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( +; GFX6-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_unordered_load( +; GFX6-LABEL: local_workgroup_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_monotonic_load( +; GFX6-LABEL: local_workgroup_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_load( +; GFX6-LABEL: local_workgroup_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_load( +; GFX6-LABEL: local_workgroup_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_unordered_store( +; GFX6-LABEL: local_workgroup_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_monotonic_store( +; GFX6-LABEL: local_workgroup_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_release_store( +; GFX6-LABEL: local_workgroup_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_store( +; GFX6-LABEL: local_workgroup_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( +; GFX6-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( +; GFX6-LABEL: local_workgroup_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_release_atomicrmw( +; GFX6-LABEL: local_workgroup_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( +; GFX6-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( +; GFX6-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @workgroup_one_as_acquire_fence() { +; GFX6-LABEL: workgroup_one_as_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @workgroup_one_as_release_fence() { +; GFX6-LABEL: workgroup_one_as_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { +; GFX6-LABEL: workgroup_one_as_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { +; GFX6-LABEL: workgroup_one_as_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_unordered_load( +; GFX6-LABEL: global_workgroup_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( +; GFX6-LABEL: global_workgroup_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_load( +; GFX6-LABEL: global_workgroup_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_unordered_store( +; GFX6-LABEL: global_workgroup_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( +; GFX6-LABEL: global_workgroup_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_store( +; GFX6-LABEL: global_workgroup_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_unordered_load( +; GFX6-LABEL: local_workgroup_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( +; GFX6-LABEL: local_workgroup_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_load( +; GFX6-LABEL: local_workgroup_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_unordered_store( +; GFX6-LABEL: local_workgroup_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( +; GFX6-LABEL: local_workgroup_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_store( +; GFX6-LABEL: local_workgroup_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @agent_acquire_fence() { +; GFX6-LABEL: agent_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("agent") acquire + ret void +} + +define amdgpu_kernel void @agent_release_fence() { +; GFX6-LABEL: agent_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("agent") release + ret void +} + +define amdgpu_kernel void @agent_acq_rel_fence() { +; GFX6-LABEL: agent_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("agent") acq_rel + ret void +} + +define amdgpu_kernel void @agent_seq_cst_fence() { +; GFX6-LABEL: agent_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_unordered_load( +; GFX6-LABEL: global_agent_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_load( +; GFX6-LABEL: global_agent_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_acquire_load( +; GFX6-LABEL: global_agent_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_load( +; GFX6-LABEL: global_agent_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_unordered_store( +; GFX6-LABEL: global_agent_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_store( +; GFX6-LABEL: global_agent_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_release_store( +; GFX6-LABEL: global_agent_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") release, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_store( +; GFX6-LABEL: global_agent_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_atomicrmw( +; GFX6-LABEL: global_agent_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_agent_acquire_atomicrmw( +; GFX6-LABEL: global_agent_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire + ret void +} + +define amdgpu_kernel void @global_agent_release_atomicrmw( +; GFX6-LABEL: global_agent_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") release + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( +; GFX6-LABEL: global_agent_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( +; GFX6-LABEL: global_agent_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( +; GFX6-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + ret void +} + +define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( +; GFX6-LABEL: global_agent_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_unordered_load( +; GFX6-LABEL: local_agent_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_monotonic_load( +; GFX6-LABEL: local_agent_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_acquire_load( +; GFX6-LABEL: local_agent_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_load( +; GFX6-LABEL: local_agent_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_unordered_store( +; GFX6-LABEL: local_agent_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_monotonic_store( +; GFX6-LABEL: local_agent_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_release_store( +; GFX6-LABEL: local_agent_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") release, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_store( +; GFX6-LABEL: local_agent_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_monotonic_atomicrmw( +; GFX6-LABEL: local_agent_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @local_agent_acquire_atomicrmw( +; GFX6-LABEL: local_agent_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire + ret void +} + +define amdgpu_kernel void @local_agent_release_atomicrmw( +; GFX6-LABEL: local_agent_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") release + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( +; GFX6-LABEL: local_agent_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( +; GFX6-LABEL: local_agent_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( +; GFX6-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + ret void +} + +define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( +; GFX6-LABEL: local_agent_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @agent_one_as_acquire_fence() { +; GFX6-LABEL: agent_one_as_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @agent_one_as_release_fence() { +; GFX6-LABEL: agent_one_as_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @agent_one_as_acq_rel_fence() { +; GFX6-LABEL: agent_one_as_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @agent_one_as_seq_cst_fence() { +; GFX6-LABEL: agent_one_as_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_one_as_unordered_load( +; GFX6-LABEL: global_agent_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_load( +; GFX6-LABEL: global_agent_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_load( +; GFX6-LABEL: global_agent_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_load( +; GFX6-LABEL: global_agent_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_unordered_store( +; GFX6-LABEL: global_agent_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_store( +; GFX6-LABEL: global_agent_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_store( +; GFX6-LABEL: global_agent_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_store( +; GFX6-LABEL: global_agent_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( +; GFX6-LABEL: global_agent_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_unordered_load( +; GFX6-LABEL: local_agent_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_load( +; GFX6-LABEL: local_agent_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_load( +; GFX6-LABEL: local_agent_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_load( +; GFX6-LABEL: local_agent_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_unordered_store( +; GFX6-LABEL: local_agent_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_store( +; GFX6-LABEL: local_agent_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_store( +; GFX6-LABEL: local_agent_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_store( +; GFX6-LABEL: local_agent_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( +; GFX6-LABEL: local_agent_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @system_acquire_fence() { +; GFX6-LABEL: system_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence acquire + ret void +} + +define amdgpu_kernel void @system_release_fence() { +; GFX6-LABEL: system_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_endpgm +entry: + fence release + ret void +} + +define amdgpu_kernel void @system_acq_rel_fence() { +; GFX6-LABEL: system_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence acq_rel + ret void +} + +define amdgpu_kernel void @system_seq_cst_fence() { +; GFX6-LABEL: system_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence seq_cst + ret void +} + +define amdgpu_kernel void @global_system_unordered_load( +; GFX6-LABEL: global_system_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_monotonic_load( +; GFX6-LABEL: global_system_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_acquire_load( +; GFX6-LABEL: global_system_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_load( +; GFX6-LABEL: global_system_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_unordered_store( +; GFX6-LABEL: global_system_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_system_monotonic_store( +; GFX6-LABEL: global_system_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_system_release_store( +; GFX6-LABEL: global_system_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out release, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_store( +; GFX6-LABEL: global_system_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_system_monotonic_atomicrmw( +; GFX6-LABEL: global_system_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic + ret void +} + +define amdgpu_kernel void @global_system_acquire_atomicrmw( +; GFX6-LABEL: global_system_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire + ret void +} + +define amdgpu_kernel void @global_system_release_atomicrmw( +; GFX6-LABEL: global_system_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_atomicrmw( +; GFX6-LABEL: global_system_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_atomicrmw( +; GFX6-LABEL: global_system_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( +; GFX6-LABEL: global_system_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic + ret void +} + +define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( +; GFX6-LABEL: global_system_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release monotonic + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire + ret void +} + +define amdgpu_kernel void @global_system_release_acquire_cmpxchg( +; GFX6-LABEL: global_system_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_unordered_load( +; GFX6-LABEL: local_system_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_monotonic_load( +; GFX6-LABEL: local_system_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_acquire_load( +; GFX6-LABEL: local_system_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_load( +; GFX6-LABEL: local_system_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_unordered_store( +; GFX6-LABEL: local_system_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_system_monotonic_store( +; GFX6-LABEL: local_system_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_system_release_store( +; GFX6-LABEL: local_system_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out release, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_store( +; GFX6-LABEL: local_system_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_system_monotonic_atomicrmw( +; GFX6-LABEL: local_system_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in monotonic + ret void +} + +define amdgpu_kernel void @local_system_acquire_atomicrmw( +; GFX6-LABEL: local_system_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire + ret void +} + +define amdgpu_kernel void @local_system_release_atomicrmw( +; GFX6-LABEL: local_system_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in release + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_atomicrmw( +; GFX6-LABEL: local_system_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_atomicrmw( +; GFX6-LABEL: local_system_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( +; GFX6-LABEL: local_system_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire monotonic + ret void +} + +define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( +; GFX6-LABEL: local_system_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release monotonic + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire acquire + ret void +} + +define amdgpu_kernel void @local_system_release_acquire_cmpxchg( +; GFX6-LABEL: local_system_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release acquire + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @system_one_as_acquire_fence() { +; GFX6-LABEL: system_one_as_acquire_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @system_one_as_release_fence() { +; GFX6-LABEL: system_one_as_release_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @system_one_as_acq_rel_fence() { +; GFX6-LABEL: system_one_as_acq_rel_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @system_one_as_seq_cst_fence() { +; GFX6-LABEL: system_one_as_seq_cst_fence: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm +entry: + fence syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_system_one_as_unordered_load( +; GFX6-LABEL: global_system_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_load( +; GFX6-LABEL: global_system_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_load( +; GFX6-LABEL: global_system_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_load( +; GFX6-LABEL: global_system_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s0, s4 +; GFX6-NEXT: s_mov_b32 s1, s5 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_mov_b32 s4, s6 +; GFX6-NEXT: s_mov_b32 s5, s7 +; GFX6-NEXT: s_mov_b32 s6, s2 +; GFX6-NEXT: s_mov_b32 s7, s3 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_unordered_store( +; GFX6-LABEL: global_system_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_store( +; GFX6-LABEL: global_system_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_store( +; GFX6-LABEL: global_system_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_store( +; GFX6-LABEL: global_system_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( +; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( +; GFX6-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_atomicrmw( +; GFX6-LABEL: global_system_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_unordered_load( +; GFX6-LABEL: local_system_one_as_unordered_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_load( +; GFX6-LABEL: local_system_one_as_monotonic_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_load( +; GFX6-LABEL: local_system_one_as_acquire_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_load( +; GFX6-LABEL: local_system_one_as_seq_cst_load: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v1, v0 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_unordered_store( +; GFX6-LABEL: local_system_one_as_unordered_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_store( +; GFX6-LABEL: local_system_one_as_monotonic_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_store( +; GFX6-LABEL: local_system_one_as_release_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_store( +; GFX6-LABEL: local_system_one_as_seq_cst_store: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( +; GFX6-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( +; GFX6-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_atomicrmw( +; GFX6-LABEL: local_system_one_as_release_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( +; GFX6-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( +; GFX6-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( +; GFX6-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( +; GFX6-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( +; GFX6-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s0 +; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX6-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xa +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_write_b32 v0, v1 +; GFX6-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir @@ -1,8 +1,10 @@ -# RUN: not llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - 2>&1 | FileCheck -check-prefix=GCN %s +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=si-memory-legalizer -o - %s 2>&1 | FileCheck %s +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=si-memory-legalizer -o - %s 2>&1 | FileCheck %s +# RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -run-pass=si-memory-legalizer -o - %s 2>&1 | FileCheck %s --- -# GCN: error: :0:0: in function invalid_load void (): Unsupported atomic address space +# CHECK: error: :0:0: in function invalid_load void (): Unsupported atomic address space name: invalid_load body: | @@ -20,7 +22,7 @@ ... --- -# GCN: error: :0:0: in function invalid_store void (): Unsupported atomic address space +# CHECK: error: :0:0: in function invalid_store void (): Unsupported atomic address space name: invalid_store body: | @@ -36,7 +38,7 @@ ... --- -# GCN: error: :0:0: in function invalid_cmpxchg void (): Unsupported atomic address space +# CHECK: error: :0:0: in function invalid_cmpxchg void (): Unsupported atomic address space name: invalid_cmpxchg body: | @@ -53,7 +55,7 @@ ... --- -# GCN: error: :0:0: in function invalid_rmw void (): Unsupported atomic address space +# CHECK: error: :0:0: in function invalid_rmw void (): Unsupported atomic address space name: invalid_rmw body: | diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll @@ -1,6 +1,6 @@ -; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s -; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s 2>&1 | FileCheck %s ; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s 2>&1 | FileCheck %s +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s 2>&1 | FileCheck %s ; CHECK: error: :0:0: in function invalid_fence void (): Unsupported atomic synchronization scope define amdgpu_kernel void @invalid_fence() { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ /dev/null @@ -1,1019 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s - -declare i32 @llvm.amdgcn.workitem.id.x() - -; GCN-LABEL: {{^}}system_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel system_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}system_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel system_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}system_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel system_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel system_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel singlethread_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel singlethread_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel singlethread_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel agent_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel agent_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel agent_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GCN-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel agent_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel workgroup_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10CU-NOT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel workgroup_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10CU-NOT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel workgroup_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0 -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel wavefront_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel wavefront_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel wavefront_one_as_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}nontemporal_private_0: -; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}} -; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_private_0 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_private_0( - i32 addrspace(5)* %in, i32* %out) { -entry: - %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}nontemporal_private_1: -; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}} -; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_private_1 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_private_1( - i32 addrspace(5)* %in, i32* %out) { -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid - %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}nontemporal_global_0: -; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}} -; GFX10: .amdhsa_kernel nontemporal_global_0 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_global_0( - i32 addrspace(1)* %in, i32* %out) { -entry: - %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}nontemporal_global_1: -; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX10: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_global_1 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_global_1( - i32 addrspace(1)* %in, i32* %out) { -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid - %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}nontemporal_local_0: -; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel nontemporal_local_0 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_local_0( - i32 addrspace(3)* %in, i32* %out) { -entry: - %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}nontemporal_local_1: -; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel nontemporal_local_1 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_local_1( - i32 addrspace(3)* %in, i32* %out) { -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid - %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}nontemporal_flat_0: -; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_flat_0 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_flat_0( - i32* %in, i32* %out) { -entry: - %val = load i32, i32* %in, align 4, !nontemporal !0 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}nontemporal_flat_1: -; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_flat_1 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_flat_1( - i32* %in, i32* %out) { -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid - %val = load i32, i32* %val.gep, align 4, !nontemporal !0 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}system_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel system_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}system_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel system_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}system_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel system_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst: -; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel system_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}singlethread_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel singlethread_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}singlethread_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel singlethread_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}singlethread_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel singlethread_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel singlethread_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}agent_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel agent_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}agent_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel agent_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}agent_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel agent_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst: -; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}} -; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NEXT: buffer_wbinvl1_vol -; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: buffer_gl1_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel agent_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}workgroup_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel workgroup_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}workgroup_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10CU-NOT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel workgroup_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}workgroup_acquire: -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10CU-NOT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX89: s_waitcnt lgkmcnt(0){{$}} -; GFX89: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel workgroup_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0 -; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GFX89: s_waitcnt lgkmcnt(0){{$}} -; GFX89: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10WGP-NEXT: buffer_gl0_inv -; GFX10CU-NOT: buffer_gl0_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel workgroup_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}wavefront_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel wavefront_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_unordered( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}wavefront_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel wavefront_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_monotonic( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}wavefront_acquire: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel wavefront_acquire -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_acquire( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 - store i32 %val, i32* %out - ret void -} - -; GCN-LABEL: {{^}}wavefront_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GFX89-NOT: buffer_wbinvl1_vol -; GFX10-NOT: buffer_gl{{[01]}}_inv -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -; GFX10: .amdhsa_kernel wavefront_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_seq_cst( - i32* %in, i32* %out) { -entry: - %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 - store i32 %val, i32* %out - ret void -} - -!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local.mir +++ /dev/null @@ -1,1054 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s - ---- - -# GCN-LABEL: name: load_singlethread_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_singlethread_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_singlethread_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_singlethread_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_singlethread_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_singlethread_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_singlethread_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_singlethread_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_wavefront_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_wavefront_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_wavefront_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_wavefront_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_wavefront_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_wavefront_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_wavefront_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_wavefront_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_workgroup_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_workgroup_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_workgroup_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_workgroup_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_workgroup_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_workgroup_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_workgroup_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_workgroup_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_agent_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_agent_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_agent_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_agent_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_agent_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_agent_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_agent_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_agent_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_system_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_system_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_system_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_system_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_system_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_system_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_system_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_system_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_singlethread_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_singlethread_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_singlethread_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_singlethread_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_singlethread_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_singlethread_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_singlethread_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_singlethread_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_wavefront_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_wavefront_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_wavefront_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_wavefront_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_wavefront_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_wavefront_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_wavefront_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_wavefront_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_workgroup_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_workgroup_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_workgroup_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_workgroup_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_workgroup_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_workgroup_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_workgroup_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_workgroup_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_agent_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_agent_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_agent_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_agent_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_agent_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_agent_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_agent_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_agent_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_system_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_system_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_system_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_system_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_system_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_system_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_system_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_system_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_acq_rel - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_acq_rel -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) - S_ENDPGM 0 - -... diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-mesa3d.ll +++ /dev/null @@ -1,440 +0,0 @@ -; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s -; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s -; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s -; RUN: llc -mtriple=amdgcn--mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN %s - -; FUNC-LABEL: {{^}}system_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @system_one_as_acquire() { -entry: - fence syncscope("one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN: s_endpgm -define amdgpu_kernel void @system_one_as_release() { -entry: - fence syncscope("one-as") release - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @system_one_as_acq_rel() { -entry: - fence syncscope("one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}system_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @system_one_as_seq_cst() { -entry: - fence syncscope("one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_one_as_acquire() { -entry: - fence syncscope("singlethread-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_one_as_release() { -entry: - fence syncscope("singlethread-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_one_as_acq_rel() { -entry: - fence syncscope("singlethread-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}singlethread_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_one_as_seq_cst() { -entry: - fence syncscope("singlethread-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN-NEXT: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @agent_one_as_acquire() { -entry: - fence syncscope("agent-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN: s_endpgm -define amdgpu_kernel void @agent_one_as_release() { -entry: - fence syncscope("agent-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @agent_one_as_acq_rel() { -entry: - fence syncscope("agent-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}agent_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0){{$}} -; GCN: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @agent_one_as_seq_cst() { -entry: - fence syncscope("agent-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_one_as_acquire() { -entry: - fence syncscope("workgroup-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_release: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_one_as_release() { -entry: - fence syncscope("workgroup-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_one_as_acq_rel() { -entry: - fence syncscope("workgroup-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}workgroup_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_one_as_seq_cst() { -entry: - fence syncscope("workgroup-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_one_as_acquire() { -entry: - fence syncscope("wavefront-one-as") acquire - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_one_as_release() { -entry: - fence syncscope("wavefront-one-as") release - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_one_as_acq_rel() { -entry: - fence syncscope("wavefront-one-as") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}wavefront_one_as_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_one_as_seq_cst() { -entry: - fence syncscope("wavefront-one-as") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}system_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @system_acquire() { -entry: - fence acquire - ret void -} - -; FUNC-LABEL: {{^}}system_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: s_endpgm -define amdgpu_kernel void @system_release() { -entry: - fence release - ret void -} - -; FUNC-LABEL: {{^}}system_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @system_acq_rel() { -entry: - fence acq_rel - ret void -} - -; FUNC-LABEL: {{^}}system_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @system_seq_cst() { -entry: - fence seq_cst - ret void -} - -; FUNC-LABEL: {{^}}singlethread_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_acquire() { -entry: - fence syncscope("singlethread") acquire - ret void -} - -; FUNC-LABEL: {{^}}singlethread_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_release() { -entry: - fence syncscope("singlethread") release - ret void -} - -; FUNC-LABEL: {{^}}singlethread_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_acq_rel() { -entry: - fence syncscope("singlethread") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}singlethread_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @singlethread_seq_cst() { -entry: - fence syncscope("singlethread") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}agent_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NEXT: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @agent_acquire() { -entry: - fence syncscope("agent") acquire - ret void -} - -; FUNC-LABEL: {{^}}agent_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: s_endpgm -define amdgpu_kernel void @agent_release() { -entry: - fence syncscope("agent") release - ret void -} - -; FUNC-LABEL: {{^}}agent_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @agent_acq_rel() { -entry: - fence syncscope("agent") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}agent_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN: buffer_wbinvl1{{$}} -; GCN: s_endpgm -define amdgpu_kernel void @agent_seq_cst() { -entry: - fence syncscope("agent") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}workgroup_acquire: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_acquire() { -entry: - fence syncscope("workgroup") acquire - ret void -} - -; FUNC-LABEL: {{^}}workgroup_release: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_release() { -entry: - fence syncscope("workgroup") release - ret void -} - -; FUNC-LABEL: {{^}}workgroup_acq_rel: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_acq_rel() { -entry: - fence syncscope("workgroup") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}workgroup_seq_cst: -; GCN: %bb.0 -; GCN-NOT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @workgroup_seq_cst() { -entry: - fence syncscope("workgroup") seq_cst - ret void -} - -; FUNC-LABEL: {{^}}wavefront_acquire: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_acquire() { -entry: - fence syncscope("wavefront") acquire - ret void -} - -; FUNC-LABEL: {{^}}wavefront_release: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_release() { -entry: - fence syncscope("wavefront") release - ret void -} - -; FUNC-LABEL: {{^}}wavefront_acq_rel: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_acq_rel() { -entry: - fence syncscope("wavefront") acq_rel - ret void -} - -; FUNC-LABEL: {{^}}wavefront_seq_cst: -; GCN: %bb.0 -; GCN-NOT: ATOMIC_FENCE -; GCN: s_endpgm -define amdgpu_kernel void @wavefront_seq_cst() { -entry: - fence syncscope("wavefront") seq_cst - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir +++ /dev/null @@ -1,65 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s - ---- - -# GCN-LABEL: name: multiple_mem_operands - -# GCN-LABEL: bb.3: -# GCN: S_WAITCNT 3952 -# GCN-NEXT: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: S_WAITCNT 3952 -# GCN-NEXT: BUFFER_WBINVL1_VOL - -name: multiple_mem_operands -body: | - bb.0.entry: - successors: %bb.1(0x30000000), %bb.2(0x50000000) - liveins: $sgpr0_sgpr1, $sgpr3 - - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) - $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) - S_WAITCNT 127 - S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 2, implicit $exec - $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) - S_CBRANCH_SCC0 %bb.1, implicit killed $scc - - bb.2: - successors: %bb.3(0x80000000) - liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 32772, implicit $exec - S_BRANCH %bb.3 - - bb.1: - successors: %bb.3(0x80000000) - liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 4, implicit $exec - - bb.3: - liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0 - - S_WAITCNT 127 - $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc - $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) - $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 - $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec - S_WAITCNT 3952 - FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`) - S_ENDPGM 0 - -... diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir +++ /dev/null @@ -1,159 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s - ---- | - define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 { - entry: - %scratch0 = alloca [8192 x i32], addrspace(5) - %scratch1 = alloca [8192 x i32], addrspace(5) - %scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)* - store i32 1, i32 addrspace(5)* %scratchptr01 - %scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)* - store i32 2, i32 addrspace(5)* %scratchptr12 - %cmp = icmp eq i32 %cond, 0 - br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 - - if: ; preds = %entry - %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0 - %if_value = load i32, i32 addrspace(5)* %if_ptr, align 4, !nontemporal !1 - br label %done, !structurizecfg.uniform !0 - - else: ; preds = %entry - %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0 - %else_value = load i32, i32 addrspace(5)* %else_ptr, align 4, !nontemporal !1 - br label %done, !structurizecfg.uniform !0 - - done: ; preds = %else, %if - %value = phi i32 [ %if_value, %if ], [ %else_value, %else ] - store i32 %value, i32 addrspace(1)* %out - ret void - } - - ; Function Attrs: convergent nounwind - declare { i1, i64 } @llvm.amdgcn.if(i1) #1 - - ; Function Attrs: convergent nounwind - declare { i1, i64 } @llvm.amdgcn.else(i64) #1 - - ; Function Attrs: convergent nounwind readnone - declare i64 @llvm.amdgcn.break(i64) #2 - - ; Function Attrs: convergent nounwind readnone - declare i64 @llvm.amdgcn.if.break(i1, i64) #2 - - ; Function Attrs: convergent nounwind readnone - declare i64 @llvm.amdgcn.else.break(i64, i64) #2 - - ; Function Attrs: convergent nounwind - declare i1 @llvm.amdgcn.loop(i64) #1 - - ; Function Attrs: convergent nounwind - declare void @llvm.amdgcn.end.cf(i64) #1 - - attributes #0 = { "target-cpu"="gfx803" } - attributes #1 = { convergent nounwind } - attributes #2 = { convergent nounwind readnone } - - !0 = !{} - !1 = !{i32 1} - -... ---- - -# CHECK-LABEL: name: multiple_mem_operands - -# CHECK-LABEL: bb.3.done: -# CHECK: BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 1, 1, 0, 0 - -name: multiple_mem_operands -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: -liveins: - - { reg: '$sgpr0_sgpr1', virtual-reg: '' } - - { reg: '$sgpr3', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 65540 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - savePoint: '' - restorePoint: '' -fixedStack: - - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default, - isImmutable: false, isAliased: false, callee-saved-register: '' } -stack: - - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4, - stack-id: default, callee-saved-register: '', local-offset: 0, - debug-info-variable: '', debug-info-expression: '', - debug-info-location: '' } - - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768, - alignment: 4, stack-id: default, callee-saved-register: '', local-offset: 32768, - debug-info-variable: '', debug-info-expression: '', - debug-info-location: '' } -constants: -body: | - bb.0.entry: - successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000) - liveins: $sgpr0_sgpr1, $sgpr3 - - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) - $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) - S_WAITCNT 127 - S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 2, implicit $exec - $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) - S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc - - bb.2.else: - successors: %bb.3.done(0x80000000) - liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 32772, implicit $exec - S_BRANCH %bb.3.done - - bb.1.if: - successors: %bb.3.done(0x80000000) - liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 4, implicit $exec - - bb.3.done: - liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0 - - S_WAITCNT 127 - $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc - $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) - $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 - $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec - S_WAITCNT 3952 - FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out) - S_ENDPGM 0 - -... diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir +++ /dev/null @@ -1,139 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s - ---- | - - define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 { - entry: - %scratch0 = alloca [8192 x i32], addrspace(5) - %scratch1 = alloca [8192 x i32], addrspace(5) - %scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)* - store i32 1, i32 addrspace(5)* %scratchptr01 - %scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)* - store i32 2, i32 addrspace(5)* %scratchptr12 - %cmp = icmp eq i32 %cond, 0 - br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 - - if: ; preds = %entry - %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0 - %if_value = load i32, i32 addrspace(5)* %if_ptr, align 4, !nontemporal !1 - br label %done, !structurizecfg.uniform !0 - - else: ; preds = %entry - %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0 - %else_value = load i32, i32 addrspace(5)* %else_ptr, align 4 - br label %done, !structurizecfg.uniform !0 - - done: ; preds = %else, %if - %value = phi i32 [ %if_value, %if ], [ %else_value, %else ] - store i32 %value, i32 addrspace(1)* %out - ret void - } - - attributes #0 = { "target-cpu"="gfx803" } - attributes #1 = { convergent nounwind } - attributes #2 = { convergent nounwind readnone } - - !0 = !{} - !1 = !{i32 1} - -... ---- - -# CHECK-LABEL: name: multiple_mem_operands - -# CHECK-LABEL: bb.3.done: -# CHECK: BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0 - -name: multiple_mem_operands -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false -tracksRegLiveness: true -registers: -liveins: - - { reg: '$sgpr0_sgpr1', virtual-reg: '' } - - { reg: '$sgpr3', virtual-reg: '' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 65540 - offsetAdjustment: 0 - maxAlignment: 4 - adjustsStack: false - hasCalls: false - stackProtector: '' - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false - savePoint: '' - restorePoint: '' -fixedStack: - - { id: 0, type: default, offset: 0, size: 4, alignment: 4, stack-id: default, - isImmutable: false, isAliased: false, callee-saved-register: '' } -stack: - - { id: 0, name: scratch0, type: default, offset: 4, size: 32768, alignment: 4, - stack-id: default, callee-saved-register: '', local-offset: 0, - debug-info-variable: '', debug-info-expression: '', - debug-info-location: '' } - - { id: 1, name: scratch1, type: default, offset: 32772, size: 32768, - alignment: 4, stack-id: default, callee-saved-register: '', local-offset: 32768, - debug-info-variable: '', debug-info-expression: '', - debug-info-location: '' } -constants: -body: | - bb.0.entry: - successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000) - liveins: $sgpr0_sgpr1, $sgpr3 - - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) - $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) - S_WAITCNT 127 - S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 2, implicit $exec - $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) - S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc - - bb.2.else: - successors: %bb.3.done(0x80000000) - liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 32772, implicit $exec - S_BRANCH %bb.3.done - - bb.1.if: - successors: %bb.3.done(0x80000000) - liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 - - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) - S_WAITCNT 3855 - $vgpr0 = V_MOV_B32_e32 4, implicit $exec - - bb.3.done: - liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0 - - S_WAITCNT 127 - $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc - $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) - $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 - $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec - S_WAITCNT 3952 - FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into %ir.out) - S_ENDPGM 0 - -... diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-region.mir +++ /dev/null @@ -1,1054 +0,0 @@ -# RUN: llc -march=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck -check-prefix=GCN %s - ---- - -# GCN-LABEL: name: load_singlethread_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_singlethread_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_singlethread_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_singlethread_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_singlethread_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_singlethread_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_singlethread_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_singlethread_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_wavefront_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_wavefront_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_wavefront_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_wavefront_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_wavefront_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_wavefront_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_wavefront_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_wavefront_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_workgroup_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_workgroup_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_workgroup_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_workgroup_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_workgroup_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_workgroup_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_workgroup_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_workgroup_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_agent_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_agent_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_agent_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_agent_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_agent_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_agent_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_agent_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_agent_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_system_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_system_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_system_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_system_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_system_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_system_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: load_system_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_READ_B32 -# GCN-NOT: S_WAITCNT -# GCN: FLAT_STORE_DWORD - -name: load_system_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`) - $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec - FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_singlethread_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_singlethread_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_singlethread_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_singlethread_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_singlethread_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_singlethread_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_singlethread_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_singlethread_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_wavefront_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_wavefront_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_wavefront_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_wavefront_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_wavefront_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_wavefront_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_wavefront_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_wavefront_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_workgroup_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_workgroup_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_workgroup_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_workgroup_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_workgroup_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_workgroup_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_workgroup_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_workgroup_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_agent_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_agent_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_agent_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_agent_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_agent_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_agent_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_agent_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_agent_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_system_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_system_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_system_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_system_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_system_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_system_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: store_system_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRITE_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: store_system_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_unordered - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_unordered -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_monotonic - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_monotonic -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_acquire - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_acquire -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_release - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_release -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_acq_rel - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_acq_rel -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... ---- - -# GCN-LABEL: name: atomicrmw_singlethread_seq_cst - -# GCN-LABEL: bb.0: -# GCN-NOT: S_WAITCNT -# GCN: DS_WRXCHG_RTN_B32 -# GCN-NOT: S_WAITCNT -# GCN: S_ENDPGM 0 - -name: atomicrmw_singlethread_seq_cst -body: | - bb.0: - $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) - $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) - $m0 = S_MOV_B32 -1 - $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec - $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec - $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) - S_ENDPGM 0 - -... diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,DWORD %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,DWORDX2 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,DWORDX2 %s ; Effectively, check that the compile finishes; in the case ; of an infinite loop, llc toggles between merging 2 ST4s @@ -8,8 +10,12 @@ target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5" ; GCN-LABEL: {{^}}_Z6brokenPd: -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} + +; DWORD: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} +; DWORD: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} + +; DWORDX2: flat_store_dwordx2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] + define amdgpu_kernel void @_Z6brokenPd(double* %arg) { bb: %tmp = alloca double, align 8, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ /dev/null @@ -1,754 +0,0 @@ -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s - -declare i32 @llvm.amdgcn.workitem.id.x() - -; GCN-LABEL: {{^}}system_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel system_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel system_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_release: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel system_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_one_as_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel system_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_one_as_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel singlethread_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel singlethread_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel singlethread_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}singlethread_one_as_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_one_as_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel agent_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel agent_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_release: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel agent_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_one_as_seq_cst: -; GCN: s_waitcnt vmcnt(0){{$}} -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel agent_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_one_as_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel workgroup_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel workgroup_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_release: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel workgroup_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_one_as_seq_cst: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_one_as_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel wavefront_one_as_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel wavefront_one_as_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel wavefront_one_as_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}wavefront_one_as_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_one_as_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}nontemporal_private_0: -; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}} -; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_private_0 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_private_0( - i32* %in, i32 addrspace(5)* %out) { -entry: - %val = load i32, i32* %in, align 4 - store i32 %val, i32 addrspace(5)* %out, !nontemporal !0 - ret void -} - -; GCN-LABEL: {{^}}nontemporal_private_1: -; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}} -; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_private_1 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_private_1( - i32* %in, i32 addrspace(5)* %out) { -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val = load i32, i32* %in, align 4 - %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid - store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0 - ret void -} - -; GCN-LABEL: {{^}}nontemporal_global_0: -; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} -; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_global_0 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_global_0( - i32* %in, i32 addrspace(1)* %out) { -entry: - %val = load i32, i32* %in, align 4 - store i32 %val, i32 addrspace(1)* %out, !nontemporal !0 - ret void -} - -; GCN-LABEL: {{^}}nontemporal_global_1: -; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_global_1 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_global_1( - i32* %in, i32 addrspace(1)* %out) { -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val = load i32, i32* %in, align 4 - %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid - store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0 - ret void -} - -; GCN-LABEL: {{^}}nontemporal_local_0: -; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel nontemporal_local_0 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_local_0( - i32* %in, i32 addrspace(3)* %out) { -entry: - %val = load i32, i32* %in, align 4 - store i32 %val, i32 addrspace(3)* %out, !nontemporal !0 - ret void -} - -; GCN-LABEL: {{^}}nontemporal_local_1: -; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel nontemporal_local_1 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_local_1( - i32* %in, i32 addrspace(3)* %out) { -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val = load i32, i32* %in, align 4 - %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid - store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0 - ret void -} - -; GCN-LABEL: {{^}}nontemporal_flat_0: -; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_flat_0 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_flat_0( - i32* %in, i32* %out) { -entry: - %val = load i32, i32* %in, align 4 - store i32 %val, i32* %out, !nontemporal !0 - ret void -} - -; GCN-LABEL: {{^}}nontemporal_flat_1: -; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}} -; GFX10: .amdhsa_kernel nontemporal_flat_1 -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @nontemporal_flat_1( - i32* %in, i32* %out) { -entry: - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %val = load i32, i32* %in, align 4 - %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid - store i32 %val, i32* %out.gep, !nontemporal !0 - ret void -} - -; GCN-LABEL: {{^}}system_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel system_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel system_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_release: -; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel system_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out release, align 4 - ret void -} - -; GCN-LABEL: {{^}}system_seq_cst: -; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel system_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @system_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}singlethread_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel singlethread_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}singlethread_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel singlethread_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}singlethread_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel singlethread_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}singlethread_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel singlethread_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @singlethread_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel agent_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel agent_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_release: -; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel agent_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("agent") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}agent_seq_cst: -; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10: s_waitcnt lgkmcnt(0){{$}} -; GFX10: s_waitcnt_vscnt null, 0x0{{$}} -; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel agent_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @agent_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel workgroup_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel workgroup_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_release: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel workgroup_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}workgroup_seq_cst: -; GFX89-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}} -; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}} -; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel workgroup_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @workgroup_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 - ret void -} - -; GCN-LABEL: {{^}}wavefront_unordered: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel wavefront_unordered -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_unordered( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 - ret void -} - -; GCN-LABEL: {{^}}wavefront_monotonic: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel wavefront_monotonic -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_monotonic( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 - ret void -} - -; GCN-LABEL: {{^}}wavefront_release: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel wavefront_release -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_release( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 - ret void -} - -; GCN-LABEL: {{^}}wavefront_seq_cst: -; GCN-NOT: s_waitcnt vmcnt(0){{$}} -; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}} -; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}} -; GFX10: .amdhsa_kernel wavefront_seq_cst -; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 -; GFX10CU: .amdhsa_workgroup_processor_mode 0 -; GFX10-NOT: .amdhsa_memory_ordered 0 -define amdgpu_kernel void @wavefront_seq_cst( - i32 %in, i32* %out) { -entry: - store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 - ret void -} - -!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer.ll @@ -0,0 +1,53378 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s + +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @private_nontemporal_load_0( +; GFX7-LABEL: private_nontemporal_load_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_nontemporal_load_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen slc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_nontemporal_load_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen slc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(5)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @private_nontemporal_load_1( +; GFX7-LABEL: private_nontemporal_load_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen glc slc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_nontemporal_load_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen slc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_nontemporal_load_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: buffer_load_dword v2, v0, s[8:11], 0 offen slc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(5)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid + %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_nontemporal_load_0( +; GFX7-LABEL: global_nontemporal_load_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_nontemporal_load_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_nontemporal_load_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_nontemporal_load_1( +; GFX7-LABEL: global_nontemporal_load_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX7-NEXT: flat_load_dword v2, v[2:3] glc slc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_nontemporal_load_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_load_dword v2, v0, s[0:1] slc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_nontemporal_load_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v2, v0, s[0:1] slc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid + %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @local_nontemporal_load_0( +; GFX7-LABEL: local_nontemporal_load_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_nontemporal_load_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: ds_read_b32 v2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_nontemporal_load_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: ds_read_b32 v2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @local_nontemporal_load_1( +; GFX7-LABEL: local_nontemporal_load_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_nontemporal_load_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: ds_read_b32 v2, v0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_nontemporal_load_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: ds_read_b32 v2, v0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid + %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @flat_nontemporal_load_0( +; GFX7-LABEL: flat_nontemporal_load_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc slc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_load_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_load_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load i32, i32* %in, align 4, !nontemporal !0 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_nontemporal_load_1( +; GFX7-LABEL: flat_nontemporal_load_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2 +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX7-NEXT: flat_load_dword v2, v[2:3] glc slc +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_load_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s0, v0 +; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] slc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_load_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s0, v0 +; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] slc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid + %val = load i32, i32* %val.gep, align 4, !nontemporal !0 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @private_nontemporal_store_0( +; GFX7-LABEL: private_nontemporal_store_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen glc slc +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_nontemporal_store_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_nontemporal_store_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen slc +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(5)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store i32 %val, i32 addrspace(5)* %out, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @private_nontemporal_store_1( +; GFX7-LABEL: private_nontemporal_store_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX7-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_add_u32 s8, s8, s7 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen glc slc +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: private_nontemporal_store_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-WGP-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_add_u32 s8, s8, s7 +; GFX10-WGP-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: private_nontemporal_store_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_mov_b64 s[10:11], s[2:3] +; GFX10-CU-NEXT: s_mov_b64 s[8:9], s[0:1] +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_add_u32 s8, s8, s7 +; GFX10-CU-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen slc +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(5)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid + store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @global_nontemporal_store_0( +; GFX7-LABEL: global_nontemporal_store_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_nontemporal_store_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off slc +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_nontemporal_store_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off slc +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store i32 %val, i32 addrspace(1)* %out, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @global_nontemporal_store_1( +; GFX7-LABEL: global_nontemporal_store_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_nontemporal_store_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3] slc +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_nontemporal_store_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3] slc +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @local_nontemporal_store_0( +; GFX7-LABEL: local_nontemporal_store_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_nontemporal_store_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_nontemporal_store_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(3)* %out) { +entry: + %val = load i32, i32 addrspace(1)* %in, align 4 + store i32 %val, i32 addrspace(3)* %out, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @local_nontemporal_store_1( +; GFX7-LABEL: local_nontemporal_store_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_nontemporal_store_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_nontemporal_store_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(3)* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32 addrspace(1)* %in, align 4 + %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid + store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @flat_nontemporal_store_0( +; GFX7-LABEL: flat_nontemporal_store_0: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 glc slc +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_store_0: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_store_0: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load i32, i32* %in, align 4 + store i32 %val, i32* %out, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @flat_nontemporal_store_1( +; GFX7-LABEL: flat_nontemporal_store_1: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: flat_load_dword v2, v[1:2] +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 glc slc +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_nontemporal_store_1: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[1:2] +; GFX10-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 slc +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_nontemporal_store_1: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-CU-NEXT: flat_load_dword v2, v[1:2] +; GFX10-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 slc +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %val = load i32, i32* %in, align 4 + %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid + store i32 %val, i32* %out.gep, !nontemporal !0 + ret void +} + +define amdgpu_kernel void @flat_singlethread_unordered_load( +; GFX7-LABEL: flat_singlethread_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_monotonic_load( +; GFX7-LABEL: flat_singlethread_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_load( +; GFX7-LABEL: flat_singlethread_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_load( +; GFX7-LABEL: flat_singlethread_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_unordered_store( +; GFX7-LABEL: flat_singlethread_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_monotonic_store( +; GFX7-LABEL: flat_singlethread_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_release_store( +; GFX7-LABEL: flat_singlethread_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_store( +; GFX7-LABEL: flat_singlethread_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( +; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( +; GFX7-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_release_atomicrmw( +; GFX7-LABEL: flat_singlethread_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release + ret void +} + +define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( +; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( +; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @singlethread_acquire_fence() { +; GFX7-LABEL: singlethread_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: singlethread_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: singlethread_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread") acquire + ret void +} + +define amdgpu_kernel void @singlethread_release_fence() { +; GFX7-LABEL: singlethread_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: singlethread_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: singlethread_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread") release + ret void +} + +define amdgpu_kernel void @singlethread_acq_rel_fence() { +; GFX7-LABEL: singlethread_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: singlethread_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: singlethread_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread") acq_rel + ret void +} + +define amdgpu_kernel void @singlethread_seq_cst_fence() { +; GFX7-LABEL: singlethread_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: singlethread_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: singlethread_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_unordered_load( +; GFX7-LABEL: global_singlethread_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_load( +; GFX7-LABEL: global_singlethread_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_load( +; GFX7-LABEL: global_singlethread_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_load( +; GFX7-LABEL: global_singlethread_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_unordered_store( +; GFX7-LABEL: global_singlethread_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_store( +; GFX7-LABEL: global_singlethread_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_release_store( +; GFX7-LABEL: global_singlethread_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") release, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_store( +; GFX7-LABEL: global_singlethread_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( +; GFX7-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( +; GFX7-LABEL: global_singlethread_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_release_atomicrmw( +; GFX7-LABEL: global_singlethread_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") release + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( +; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( +; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( +; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( +; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_unordered_load( +; GFX7-LABEL: local_singlethread_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_load( +; GFX7-LABEL: local_singlethread_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_load( +; GFX7-LABEL: local_singlethread_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_load( +; GFX7-LABEL: local_singlethread_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_unordered_store( +; GFX7-LABEL: local_singlethread_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_store( +; GFX7-LABEL: local_singlethread_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_release_store( +; GFX7-LABEL: local_singlethread_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_store( +; GFX7-LABEL: local_singlethread_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( +; GFX7-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( +; GFX7-LABEL: local_singlethread_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_release_atomicrmw( +; GFX7-LABEL: local_singlethread_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( +; GFX7-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( +; GFX7-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( +; GFX7-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( +; GFX7-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( +; GFX7-LABEL: flat_singlethread_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( +; GFX7-LABEL: flat_singlethread_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( +; GFX7-LABEL: flat_singlethread_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_release_store( +; GFX7-LABEL: flat_singlethread_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @singlethread_one_as_acquire_fence() { +; GFX7-LABEL: singlethread_one_as_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: singlethread_one_as_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: singlethread_one_as_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @singlethread_one_as_release_fence() { +; GFX7-LABEL: singlethread_one_as_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: singlethread_one_as_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: singlethread_one_as_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { +; GFX7-LABEL: singlethread_one_as_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: singlethread_one_as_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: singlethread_one_as_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { +; GFX7-LABEL: singlethread_one_as_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: singlethread_one_as_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: singlethread_one_as_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_unordered_load( +; GFX7-LABEL: global_singlethread_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( +; GFX7-LABEL: global_singlethread_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_load( +; GFX7-LABEL: global_singlethread_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_unordered_store( +; GFX7-LABEL: global_singlethread_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( +; GFX7-LABEL: global_singlethread_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_store( +; GFX7-LABEL: global_singlethread_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( +; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( +; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( +; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_unordered_load( +; GFX7-LABEL: local_singlethread_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( +; GFX7-LABEL: local_singlethread_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_load( +; GFX7-LABEL: local_singlethread_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_unordered_store( +; GFX7-LABEL: local_singlethread_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( +; GFX7-LABEL: local_singlethread_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_store( +; GFX7-LABEL: local_singlethread_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( +; GFX7-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( +; GFX7-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( +; GFX7-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("singlethread-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_unordered_load( +; GFX7-LABEL: flat_wavefront_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_monotonic_load( +; GFX7-LABEL: flat_wavefront_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_acquire_load( +; GFX7-LABEL: flat_wavefront_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_load( +; GFX7-LABEL: flat_wavefront_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_unordered_store( +; GFX7-LABEL: flat_wavefront_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_monotonic_store( +; GFX7-LABEL: flat_wavefront_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_release_store( +; GFX7-LABEL: flat_wavefront_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_store( +; GFX7-LABEL: flat_wavefront_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( +; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( +; GFX7-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_release_atomicrmw( +; GFX7-LABEL: flat_wavefront_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release + ret void +} + +define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( +; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( +; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst + ret void +} + +define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @wavefront_acquire_fence() { +; GFX7-LABEL: wavefront_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: wavefront_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: wavefront_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront") acquire + ret void +} + +define amdgpu_kernel void @wavefront_release_fence() { +; GFX7-LABEL: wavefront_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: wavefront_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: wavefront_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront") release + ret void +} + +define amdgpu_kernel void @wavefront_acq_rel_fence() { +; GFX7-LABEL: wavefront_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: wavefront_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: wavefront_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront") acq_rel + ret void +} + +define amdgpu_kernel void @wavefront_seq_cst_fence() { +; GFX7-LABEL: wavefront_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: wavefront_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: wavefront_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_unordered_load( +; GFX7-LABEL: global_wavefront_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_load( +; GFX7-LABEL: global_wavefront_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_load( +; GFX7-LABEL: global_wavefront_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_load( +; GFX7-LABEL: global_wavefront_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_unordered_store( +; GFX7-LABEL: global_wavefront_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_store( +; GFX7-LABEL: global_wavefront_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_release_store( +; GFX7-LABEL: global_wavefront_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") release, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_store( +; GFX7-LABEL: global_wavefront_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( +; GFX7-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( +; GFX7-LABEL: global_wavefront_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_release_atomicrmw( +; GFX7-LABEL: global_wavefront_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") release + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( +; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( +; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( +; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( +; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_unordered_load( +; GFX7-LABEL: local_wavefront_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_load( +; GFX7-LABEL: local_wavefront_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_load( +; GFX7-LABEL: local_wavefront_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_load( +; GFX7-LABEL: local_wavefront_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_unordered_store( +; GFX7-LABEL: local_wavefront_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_store( +; GFX7-LABEL: local_wavefront_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_release_store( +; GFX7-LABEL: local_wavefront_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") release, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_store( +; GFX7-LABEL: local_wavefront_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( +; GFX7-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( +; GFX7-LABEL: local_wavefront_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_release_atomicrmw( +; GFX7-LABEL: local_wavefront_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") release + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( +; GFX7-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( +; GFX7-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( +; GFX7-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( +; GFX7-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( +; GFX7-LABEL: flat_wavefront_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( +; GFX7-LABEL: flat_wavefront_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( +; GFX7-LABEL: flat_wavefront_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_release_store( +; GFX7-LABEL: flat_wavefront_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @wavefront_one_as_acquire_fence() { +; GFX7-LABEL: wavefront_one_as_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: wavefront_one_as_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: wavefront_one_as_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @wavefront_one_as_release_fence() { +; GFX7-LABEL: wavefront_one_as_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: wavefront_one_as_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: wavefront_one_as_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { +; GFX7-LABEL: wavefront_one_as_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: wavefront_one_as_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: wavefront_one_as_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { +; GFX7-LABEL: wavefront_one_as_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: wavefront_one_as_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: wavefront_one_as_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_unordered_load( +; GFX7-LABEL: global_wavefront_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( +; GFX7-LABEL: global_wavefront_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_load( +; GFX7-LABEL: global_wavefront_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_unordered_store( +; GFX7-LABEL: global_wavefront_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( +; GFX7-LABEL: global_wavefront_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_store( +; GFX7-LABEL: global_wavefront_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( +; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( +; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( +; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_unordered_load( +; GFX7-LABEL: local_wavefront_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( +; GFX7-LABEL: local_wavefront_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_load( +; GFX7-LABEL: local_wavefront_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_unordered_store( +; GFX7-LABEL: local_wavefront_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( +; GFX7-LABEL: local_wavefront_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_store( +; GFX7-LABEL: local_wavefront_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( +; GFX7-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( +; GFX7-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( +; GFX7-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("wavefront-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_unordered_load( +; GFX7-LABEL: flat_workgroup_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_monotonic_load( +; GFX7-LABEL: flat_workgroup_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_acquire_load( +; GFX7-LABEL: flat_workgroup_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_load( +; GFX7-LABEL: flat_workgroup_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_unordered_store( +; GFX7-LABEL: flat_workgroup_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_monotonic_store( +; GFX7-LABEL: flat_workgroup_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_release_store( +; GFX7-LABEL: flat_workgroup_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_store( +; GFX7-LABEL: flat_workgroup_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( +; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( +; GFX7-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_release_atomicrmw( +; GFX7-LABEL: flat_workgroup_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( +; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( +; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @workgroup_acquire_fence() { +; GFX7-LABEL: workgroup_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: workgroup_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: workgroup_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @workgroup_release_fence() { +; GFX7-LABEL: workgroup_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: workgroup_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: workgroup_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @workgroup_acq_rel_fence() { +; GFX7-LABEL: workgroup_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: workgroup_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: workgroup_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @workgroup_seq_cst_fence() { +; GFX7-LABEL: workgroup_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: workgroup_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: workgroup_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_unordered_load( +; GFX7-LABEL: global_workgroup_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_load( +; GFX7-LABEL: global_workgroup_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_load( +; GFX7-LABEL: global_workgroup_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_load( +; GFX7-LABEL: global_workgroup_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_unordered_store( +; GFX7-LABEL: global_workgroup_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_store( +; GFX7-LABEL: global_workgroup_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_release_store( +; GFX7-LABEL: global_workgroup_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_store( +; GFX7-LABEL: global_workgroup_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( +; GFX7-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( +; GFX7-LABEL: global_workgroup_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_release_atomicrmw( +; GFX7-LABEL: global_workgroup_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( +; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( +; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( +; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( +; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_unordered_load( +; GFX7-LABEL: local_workgroup_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_monotonic_load( +; GFX7-LABEL: local_workgroup_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_load( +; GFX7-LABEL: local_workgroup_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_load( +; GFX7-LABEL: local_workgroup_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_unordered_store( +; GFX7-LABEL: local_workgroup_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_monotonic_store( +; GFX7-LABEL: local_workgroup_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_release_store( +; GFX7-LABEL: local_workgroup_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_store( +; GFX7-LABEL: local_workgroup_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( +; GFX7-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( +; GFX7-LABEL: local_workgroup_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_release_atomicrmw( +; GFX7-LABEL: local_workgroup_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( +; GFX7-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( +; GFX7-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( +; GFX7-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( +; GFX7-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( +; GFX7-LABEL: flat_workgroup_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( +; GFX7-LABEL: flat_workgroup_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( +; GFX7-LABEL: flat_workgroup_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_release_store( +; GFX7-LABEL: flat_workgroup_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @workgroup_one_as_acquire_fence() { +; GFX7-LABEL: workgroup_one_as_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: workgroup_one_as_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: workgroup_one_as_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @workgroup_one_as_release_fence() { +; GFX7-LABEL: workgroup_one_as_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: workgroup_one_as_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: workgroup_one_as_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { +; GFX7-LABEL: workgroup_one_as_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: workgroup_one_as_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: workgroup_one_as_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { +; GFX7-LABEL: workgroup_one_as_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: workgroup_one_as_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: workgroup_one_as_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_unordered_load( +; GFX7-LABEL: global_workgroup_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( +; GFX7-LABEL: global_workgroup_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_load( +; GFX7-LABEL: global_workgroup_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_unordered_store( +; GFX7-LABEL: global_workgroup_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( +; GFX7-LABEL: global_workgroup_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_store( +; GFX7-LABEL: global_workgroup_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( +; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( +; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( +; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_unordered_load( +; GFX7-LABEL: local_workgroup_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( +; GFX7-LABEL: local_workgroup_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_load( +; GFX7-LABEL: local_workgroup_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_unordered_store( +; GFX7-LABEL: local_workgroup_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( +; GFX7-LABEL: local_workgroup_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_store( +; GFX7-LABEL: local_workgroup_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( +; GFX7-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( +; GFX7-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( +; GFX7-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_unordered_load( +; GFX7-LABEL: flat_agent_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_monotonic_load( +; GFX7-LABEL: flat_agent_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_load( +; GFX7-LABEL: flat_agent_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_load( +; GFX7-LABEL: flat_agent_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_unordered_store( +; GFX7-LABEL: flat_agent_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_monotonic_store( +; GFX7-LABEL: flat_agent_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_release_store( +; GFX7-LABEL: flat_agent_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_store( +; GFX7-LABEL: flat_agent_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( +; GFX7-LABEL: flat_agent_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_atomicrmw( +; GFX7-LABEL: flat_agent_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire + ret void +} + +define amdgpu_kernel void @flat_agent_release_atomicrmw( +; GFX7-LABEL: flat_agent_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release + ret void +} + +define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( +; GFX7-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( +; GFX7-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire + ret void +} + +define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @agent_acquire_fence() { +; GFX7-LABEL: agent_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: agent_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: agent_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") acquire + ret void +} + +define amdgpu_kernel void @agent_release_fence() { +; GFX7-LABEL: agent_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: agent_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: agent_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") release + ret void +} + +define amdgpu_kernel void @agent_acq_rel_fence() { +; GFX7-LABEL: agent_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: agent_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: agent_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") acq_rel + ret void +} + +define amdgpu_kernel void @agent_seq_cst_fence() { +; GFX7-LABEL: agent_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: agent_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: agent_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_unordered_load( +; GFX7-LABEL: global_agent_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_load( +; GFX7-LABEL: global_agent_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_acquire_load( +; GFX7-LABEL: global_agent_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_load( +; GFX7-LABEL: global_agent_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_unordered_store( +; GFX7-LABEL: global_agent_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_store( +; GFX7-LABEL: global_agent_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_release_store( +; GFX7-LABEL: global_agent_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") release, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_store( +; GFX7-LABEL: global_agent_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_atomicrmw( +; GFX7-LABEL: global_agent_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @global_agent_acquire_atomicrmw( +; GFX7-LABEL: global_agent_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire + ret void +} + +define amdgpu_kernel void @global_agent_release_atomicrmw( +; GFX7-LABEL: global_agent_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") release + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( +; GFX7-LABEL: global_agent_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( +; GFX7-LABEL: global_agent_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( +; GFX7-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + ret void +} + +define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( +; GFX7-LABEL: global_agent_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_unordered_load( +; GFX7-LABEL: local_agent_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_monotonic_load( +; GFX7-LABEL: local_agent_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_acquire_load( +; GFX7-LABEL: local_agent_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_load( +; GFX7-LABEL: local_agent_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_unordered_store( +; GFX7-LABEL: local_agent_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_monotonic_store( +; GFX7-LABEL: local_agent_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_release_store( +; GFX7-LABEL: local_agent_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") release, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_store( +; GFX7-LABEL: local_agent_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_monotonic_atomicrmw( +; GFX7-LABEL: local_agent_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") monotonic + ret void +} + +define amdgpu_kernel void @local_agent_acquire_atomicrmw( +; GFX7-LABEL: local_agent_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire + ret void +} + +define amdgpu_kernel void @local_agent_release_atomicrmw( +; GFX7-LABEL: local_agent_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") release + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( +; GFX7-LABEL: local_agent_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( +; GFX7-LABEL: local_agent_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( +; GFX7-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release monotonic + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + ret void +} + +define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( +; GFX7-LABEL: local_agent_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_unordered_load( +; GFX7-LABEL: flat_agent_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_monotonic_load( +; GFX7-LABEL: flat_agent_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_load( +; GFX7-LABEL: flat_agent_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( +; GFX7-LABEL: flat_agent_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_unordered_store( +; GFX7-LABEL: flat_agent_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_monotonic_store( +; GFX7-LABEL: flat_agent_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_release_store( +; GFX7-LABEL: flat_agent_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( +; GFX7-LABEL: flat_agent_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @agent_one_as_acquire_fence() { +; GFX7-LABEL: agent_one_as_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: agent_one_as_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: agent_one_as_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @agent_one_as_release_fence() { +; GFX7-LABEL: agent_one_as_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: agent_one_as_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: agent_one_as_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @agent_one_as_acq_rel_fence() { +; GFX7-LABEL: agent_one_as_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: agent_one_as_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: agent_one_as_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @agent_one_as_seq_cst_fence() { +; GFX7-LABEL: agent_one_as_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: agent_one_as_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: agent_one_as_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_one_as_unordered_load( +; GFX7-LABEL: global_agent_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_load( +; GFX7-LABEL: global_agent_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_load( +; GFX7-LABEL: global_agent_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_load( +; GFX7-LABEL: global_agent_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_agent_one_as_unordered_store( +; GFX7-LABEL: global_agent_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_store( +; GFX7-LABEL: global_agent_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_store( +; GFX7-LABEL: global_agent_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_store( +; GFX7-LABEL: global_agent_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( +; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( +; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( +; GFX7-LABEL: global_agent_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_unordered_load( +; GFX7-LABEL: local_agent_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_load( +; GFX7-LABEL: local_agent_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_load( +; GFX7-LABEL: local_agent_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_load( +; GFX7-LABEL: local_agent_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_agent_one_as_unordered_store( +; GFX7-LABEL: local_agent_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_store( +; GFX7-LABEL: local_agent_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_store( +; GFX7-LABEL: local_agent_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_store( +; GFX7-LABEL: local_agent_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( +; GFX7-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( +; GFX7-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( +; GFX7-LABEL: local_agent_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("agent-one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_unordered_load( +; GFX7-LABEL: flat_system_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_monotonic_load( +; GFX7-LABEL: flat_system_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_acquire_load( +; GFX7-LABEL: flat_system_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_load( +; GFX7-LABEL: flat_system_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_unordered_store( +; GFX7-LABEL: flat_system_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_monotonic_store( +; GFX7-LABEL: flat_system_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_release_store( +; GFX7-LABEL: flat_system_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out release, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_store( +; GFX7-LABEL: flat_system_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_monotonic_atomicrmw( +; GFX7-LABEL: flat_system_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in monotonic + ret void +} + +define amdgpu_kernel void @flat_system_acquire_atomicrmw( +; GFX7-LABEL: flat_system_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in acquire + ret void +} + +define amdgpu_kernel void @flat_system_release_atomicrmw( +; GFX7-LABEL: flat_system_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in release + ret void +} + +define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( +; GFX7-LABEL: flat_system_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( +; GFX7-LABEL: flat_system_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release monotonic + ret void +} + +define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire + ret void +} + +define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( +; GFX7-LABEL: flat_system_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire + ret void +} + +define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @system_acquire_fence() { +; GFX7-LABEL: system_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: system_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: system_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence acquire + ret void +} + +define amdgpu_kernel void @system_release_fence() { +; GFX7-LABEL: system_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: system_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: system_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +entry: + fence release + ret void +} + +define amdgpu_kernel void @system_acq_rel_fence() { +; GFX7-LABEL: system_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: system_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: system_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence acq_rel + ret void +} + +define amdgpu_kernel void @system_seq_cst_fence() { +; GFX7-LABEL: system_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: system_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: system_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence seq_cst + ret void +} + +define amdgpu_kernel void @global_system_unordered_load( +; GFX7-LABEL: global_system_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_monotonic_load( +; GFX7-LABEL: global_system_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_acquire_load( +; GFX7-LABEL: global_system_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_load( +; GFX7-LABEL: global_system_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_unordered_store( +; GFX7-LABEL: global_system_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_system_monotonic_store( +; GFX7-LABEL: global_system_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_system_release_store( +; GFX7-LABEL: global_system_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out release, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_store( +; GFX7-LABEL: global_system_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_system_monotonic_atomicrmw( +; GFX7-LABEL: global_system_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic + ret void +} + +define amdgpu_kernel void @global_system_acquire_atomicrmw( +; GFX7-LABEL: global_system_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire + ret void +} + +define amdgpu_kernel void @global_system_release_atomicrmw( +; GFX7-LABEL: global_system_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_atomicrmw( +; GFX7-LABEL: global_system_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_atomicrmw( +; GFX7-LABEL: global_system_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( +; GFX7-LABEL: global_system_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic + ret void +} + +define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( +; GFX7-LABEL: global_system_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release monotonic + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire + ret void +} + +define amdgpu_kernel void @global_system_release_acquire_cmpxchg( +; GFX7-LABEL: global_system_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_unordered_load( +; GFX7-LABEL: local_system_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_monotonic_load( +; GFX7-LABEL: local_system_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_acquire_load( +; GFX7-LABEL: local_system_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_load( +; GFX7-LABEL: local_system_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_unordered_store( +; GFX7-LABEL: local_system_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_system_monotonic_store( +; GFX7-LABEL: local_system_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_system_release_store( +; GFX7-LABEL: local_system_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out release, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_store( +; GFX7-LABEL: local_system_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_system_monotonic_atomicrmw( +; GFX7-LABEL: local_system_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in monotonic + ret void +} + +define amdgpu_kernel void @local_system_acquire_atomicrmw( +; GFX7-LABEL: local_system_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire + ret void +} + +define amdgpu_kernel void @local_system_release_atomicrmw( +; GFX7-LABEL: local_system_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in release + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_atomicrmw( +; GFX7-LABEL: local_system_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_atomicrmw( +; GFX7-LABEL: local_system_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst + ret void +} + +define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( +; GFX7-LABEL: local_system_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire monotonic + ret void +} + +define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( +; GFX7-LABEL: local_system_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release monotonic + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire acquire + ret void +} + +define amdgpu_kernel void @local_system_release_acquire_cmpxchg( +; GFX7-LABEL: local_system_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release acquire + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_unordered_load( +; GFX7-LABEL: flat_system_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_one_as_monotonic_load( +; GFX7-LABEL: flat_system_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_load( +; GFX7-LABEL: flat_system_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_load( +; GFX7-LABEL: flat_system_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_load_dword v2, v[0:1] glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %in, i32* %out) { +entry: + %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4 + store i32 %val, i32* %out + ret void +} + +define amdgpu_kernel void @flat_system_one_as_unordered_store( +; GFX7-LABEL: flat_system_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_monotonic_store( +; GFX7-LABEL: flat_system_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_release_store( +; GFX7-LABEL: flat_system_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_store( +; GFX7-LABEL: flat_system_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32* %out) { +entry: + store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( +; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( +; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( +; GFX7-LABEL: flat_system_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 +; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 +; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: flat_store_dword v[0:1], v2 +; GFX10-CU-NEXT: s_endpgm + i32* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = cmpxchg volatile i32* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32* %out, align 4 + ret void +} + +define amdgpu_kernel void @system_one_as_acquire_fence() { +; GFX7-LABEL: system_one_as_acquire_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: system_one_as_acquire_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: system_one_as_acquire_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @system_one_as_release_fence() { +; GFX7-LABEL: system_one_as_release_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: system_one_as_release_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: system_one_as_release_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @system_one_as_acq_rel_fence() { +; GFX7-LABEL: system_one_as_acq_rel_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: system_one_as_acq_rel_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: system_one_as_acq_rel_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @system_one_as_seq_cst_fence() { +; GFX7-LABEL: system_one_as_seq_cst_fence: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: system_one_as_seq_cst_fence: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: system_one_as_seq_cst_fence: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm +entry: + fence syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_system_one_as_unordered_load( +; GFX7-LABEL: global_system_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_load( +; GFX7-LABEL: global_system_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_load( +; GFX7-LABEL: global_system_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_load( +; GFX7-LABEL: global_system_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_load_dword v0, v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dword v[2:3], v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_load_dword v2, v[0:1], off glc dlc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_system_one_as_unordered_store( +; GFX7-LABEL: global_system_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_store( +; GFX7-LABEL: global_system_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_store( +; GFX7-LABEL: global_system_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_store( +; GFX7-LABEL: global_system_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(1)* %out) { +entry: + store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( +; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( +; GFX7-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_atomicrmw( +; GFX7-LABEL: global_system_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v[0:1], v2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v[0:1], v2, off +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_swap v2, v[0:1], v2, off glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 16 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:16 +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s4, s0, 16 +; GFX7-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_clause 0x1 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) +; GFX10-WGP-NEXT: buffer_gl0_inv +; GFX10-WGP-NEXT: buffer_gl1_inv +; GFX10-WGP-NEXT: global_store_dword v[0:1], v2, off +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_clause 0x1 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:16 glc +; GFX10-CU-NEXT: s_waitcnt vmcnt(0) +; GFX10-CU-NEXT: buffer_gl0_inv +; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: global_store_dword v[0:1], v2, off +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(1)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(1)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(1)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_unordered_load( +; GFX7-LABEL: local_system_one_as_unordered_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_unordered_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_unordered_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_load( +; GFX7-LABEL: local_system_one_as_monotonic_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_monotonic_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_monotonic_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_load( +; GFX7-LABEL: local_system_one_as_acquire_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_load( +; GFX7-LABEL: local_system_one_as_seq_cst_load: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: ds_read_b32 v0, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v1, v0 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_load: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_read_b32 v0, v0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v1, v0 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_load: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_read_b32 v0, v0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v1, v0 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4 + store i32 %val, i32 addrspace(3)* %out + ret void +} + +define amdgpu_kernel void @local_system_one_as_unordered_store( +; GFX7-LABEL: local_system_one_as_unordered_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_unordered_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_unordered_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_store( +; GFX7-LABEL: local_system_one_as_monotonic_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_monotonic_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_monotonic_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_store( +; GFX7-LABEL: local_system_one_as_release_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_release_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_release_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_store( +; GFX7-LABEL: local_system_one_as_seq_cst_store: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_store: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_store: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 %in, i32 addrspace(3)* %out) { +entry: + store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( +; GFX7-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( +; GFX7-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_atomicrmw( +; GFX7-LABEL: local_system_one_as_release_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_release_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_release_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( +; GFX7-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( +; GFX7-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( +; GFX7-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( +; GFX7-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( +; GFX7-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in) { +entry: + %val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst + store i32 %val, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( +; GFX7-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( +; GFX7-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( +; GFX7-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( +; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( +; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( +; GFX7-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( +; GFX7-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( +; GFX7-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( +; GFX7-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( +; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( +; GFX7-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( +; GFX7-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( +; GFX7-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") release acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( +; GFX7-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( +; GFX7-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( +; GFX7-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_write_b32 v0, v1 +; GFX7-NEXT: s_endpgm +; +; GFX10-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-WGP: ; %bb.0: ; %entry +; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-WGP-NEXT: ; implicit-def: $vcc_hi +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-WGP-NEXT: ds_write_b32 v0, v1 +; GFX10-WGP-NEXT: s_endpgm +; +; GFX10-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX10-CU: ; %bb.0: ; %entry +; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX10-CU-NEXT: ; implicit-def: $vcc_hi +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 +; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-CU-NEXT: ds_write_b32 v0, v1 +; GFX10-CU-NEXT: s_endpgm + i32 addrspace(3)* %out, i32 %in, i32 %old) { +entry: + %gep = getelementptr i32, i32 addrspace(3)* %out, i32 4 + %val = cmpxchg volatile i32 addrspace(3)* %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst + %val0 = extractvalue { i32, i1 } %val, 0 + store i32 %val0, i32 addrspace(3)* %out, align 4 + ret void +} + +!0 = !{i32 1} diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer.mir @@ -0,0 +1,4521 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=si-memory-legalizer -o - %s | FileCheck --check-prefixes=GFX7 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=si-memory-legalizer -o - %s | FileCheck --check-prefixes=GFX10-WGP %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -run-pass=si-memory-legalizer -o - %s | FileCheck --check-prefixes=GFX10-CU %s + +--- + +name: region_load_singlethread_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_load_singlethread_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_singlethread_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_singlethread_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 1, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_singlethread_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_load_singlethread_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_singlethread_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_singlethread_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_singlethread_acquire +body: | + bb.0: + ; GFX7-LABEL: name: region_load_singlethread_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_singlethread_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_singlethread_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_singlethread_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_load_singlethread_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_singlethread_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_singlethread_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_wavefront_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_load_wavefront_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_wavefront_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_wavefront_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_wavefront_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_load_wavefront_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_wavefront_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_wavefront_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_wavefront_acquire +body: | + bb.0: + ; GFX7-LABEL: name: region_load_wavefront_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_wavefront_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_wavefront_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_wavefront_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_load_wavefront_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_wavefront_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_wavefront_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_workgroup_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_load_workgroup_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_workgroup_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_workgroup_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_workgroup_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_load_workgroup_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_workgroup_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_workgroup_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_workgroup_acquire +body: | + bb.0: + ; GFX7-LABEL: name: region_load_workgroup_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_workgroup_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_workgroup_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_workgroup_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_load_workgroup_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_workgroup_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_workgroup_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_agent_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_load_agent_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_agent_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_agent_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_agent_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_load_agent_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_agent_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_agent_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_agent_acquire +body: | + bb.0: + ; GFX7-LABEL: name: region_load_agent_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_agent_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_agent_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_agent_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_load_agent_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_agent_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_agent_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_system_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_load_system_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_system_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_system_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_system_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_load_system_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_system_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_system_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_system_acquire +body: | + bb.0: + ; GFX7-LABEL: name: region_load_system_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_system_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_system_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_load_system_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_load_system_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_load_system_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_load_system_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 1, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(2)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_singlethread_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_store_singlethread_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_singlethread_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_singlethread_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_singlethread_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_store_singlethread_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_singlethread_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_singlethread_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_singlethread_release +body: | + bb.0: + ; GFX7-LABEL: name: region_store_singlethread_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_singlethread_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_singlethread_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_singlethread_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_store_singlethread_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_singlethread_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_singlethread_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_wavefront_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_store_wavefront_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_wavefront_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_wavefront_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_wavefront_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_store_wavefront_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_wavefront_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_wavefront_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_wavefront_release +body: | + bb.0: + ; GFX7-LABEL: name: region_store_wavefront_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_wavefront_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_wavefront_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_wavefront_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_store_wavefront_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_wavefront_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_wavefront_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_workgroup_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_store_workgroup_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_workgroup_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_workgroup_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_workgroup_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_store_workgroup_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_workgroup_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_workgroup_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_workgroup_release +body: | + bb.0: + ; GFX7-LABEL: name: region_store_workgroup_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_workgroup_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_workgroup_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_workgroup_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_store_workgroup_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_workgroup_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_workgroup_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_agent_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_store_agent_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_agent_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_agent_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_agent_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_store_agent_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_agent_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_agent_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_agent_release +body: | + bb.0: + ; GFX7-LABEL: name: region_store_agent_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_agent_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_agent_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_agent_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_store_agent_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_agent_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_agent_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_system_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_store_system_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_system_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_system_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store unordered 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_system_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_store_system_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_system_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_system_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store monotonic 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_system_release +body: | + bb.0: + ; GFX7-LABEL: name: region_store_system_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_system_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_system_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_store_system_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_store_system_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_store_system_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_store_system_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_atomicrmw_singlethread_unordered +body: | + bb.0: + ; GFX7-LABEL: name: region_atomicrmw_singlethread_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_atomicrmw_singlethread_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_atomicrmw_singlethread_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_atomicrmw_singlethread_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: region_atomicrmw_singlethread_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_atomicrmw_singlethread_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_atomicrmw_singlethread_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_atomicrmw_singlethread_acquire +body: | + bb.0: + ; GFX7-LABEL: name: region_atomicrmw_singlethread_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_atomicrmw_singlethread_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_atomicrmw_singlethread_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_atomicrmw_singlethread_release +body: | + bb.0: + ; GFX7-LABEL: name: region_atomicrmw_singlethread_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_atomicrmw_singlethread_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_atomicrmw_singlethread_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_atomicrmw_singlethread_acq_rel +body: | + bb.0: + ; GFX7-LABEL: name: region_atomicrmw_singlethread_acq_rel + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_atomicrmw_singlethread_acq_rel + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_atomicrmw_singlethread_acq_rel + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: region_atomicrmw_singlethread_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: region_atomicrmw_singlethread_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: region_atomicrmw_singlethread_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: region_atomicrmw_singlethread_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`, addrspace 2) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 1, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(2)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_singlethread_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_load_singlethread_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_singlethread_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_singlethread_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") unordered 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_singlethread_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_load_singlethread_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_singlethread_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_singlethread_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") monotonic 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_singlethread_acquire +body: | + bb.0: + ; GFX7-LABEL: name: local_load_singlethread_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_singlethread_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_singlethread_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") acquire 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_singlethread_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_load_singlethread_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_singlethread_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_singlethread_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("singlethread-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_wavefront_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_load_wavefront_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_wavefront_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_wavefront_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") unordered 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_wavefront_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_load_wavefront_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_wavefront_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_wavefront_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") monotonic 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_wavefront_acquire +body: | + bb.0: + ; GFX7-LABEL: name: local_load_wavefront_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_wavefront_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_wavefront_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") acquire 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_wavefront_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_load_wavefront_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_wavefront_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_wavefront_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("wavefront-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_workgroup_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_load_workgroup_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_workgroup_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_workgroup_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") unordered 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_workgroup_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_load_workgroup_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_workgroup_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_workgroup_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") monotonic 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_workgroup_acquire +body: | + bb.0: + ; GFX7-LABEL: name: local_load_workgroup_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_workgroup_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_workgroup_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") acquire 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_workgroup_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_load_workgroup_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_workgroup_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_workgroup_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("workgroup-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_agent_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_load_agent_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_agent_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_agent_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") unordered 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_agent_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_load_agent_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_agent_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_agent_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") monotonic 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_agent_acquire +body: | + bb.0: + ; GFX7-LABEL: name: local_load_agent_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_agent_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_agent_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") acquire 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_agent_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_load_agent_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_agent_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_agent_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("agent-one-as") seq_cst 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_system_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_load_system_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_system_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_system_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") unordered 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_system_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_load_system_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_system_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_system_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") monotonic 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_system_acquire +body: | + bb.0: + ; GFX7-LABEL: name: local_load_system_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_system_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_system_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") acquire 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_load_system_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_load_system_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX7: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_load_system_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-WGP: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_load_system_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + ; GFX10-CU: FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed $sgpr0_sgpr1, 44, 0, 0 :: (dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, align 4, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + renamable $vgpr2 = DS_READ_B32 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(3)* undef`) + $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr0_sgpr1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $sgpr0_sgpr1, implicit $exec + FLAT_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_singlethread_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_store_singlethread_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_singlethread_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_singlethread_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_singlethread_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_store_singlethread_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_singlethread_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_singlethread_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_singlethread_release +body: | + bb.0: + ; GFX7-LABEL: name: local_store_singlethread_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_singlethread_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_singlethread_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_singlethread_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_store_singlethread_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_singlethread_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_singlethread_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_wavefront_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_store_wavefront_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_wavefront_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_wavefront_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") unordered 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_wavefront_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_store_wavefront_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_wavefront_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_wavefront_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") monotonic 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_wavefront_release +body: | + bb.0: + ; GFX7-LABEL: name: local_store_wavefront_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_wavefront_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_wavefront_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") release 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_wavefront_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_store_wavefront_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_wavefront_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_wavefront_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("wavefront-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_workgroup_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_store_workgroup_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_workgroup_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_workgroup_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") unordered 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_workgroup_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_store_workgroup_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_workgroup_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_workgroup_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") monotonic 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_workgroup_release +body: | + bb.0: + ; GFX7-LABEL: name: local_store_workgroup_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_workgroup_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_workgroup_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") release 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_workgroup_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_store_workgroup_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_workgroup_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_workgroup_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("workgroup-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_agent_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_store_agent_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_agent_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_agent_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") unordered 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_agent_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_store_agent_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_agent_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_agent_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") monotonic 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_agent_release +body: | + bb.0: + ; GFX7-LABEL: name: local_store_agent_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_agent_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_agent_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") release 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_agent_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_store_agent_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_agent_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_agent_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("agent-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_system_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_store_system_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_system_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_system_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") unordered 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_system_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_store_system_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_system_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_system_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") monotonic 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_system_release +body: | + bb.0: + ; GFX7-LABEL: name: local_store_system_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_system_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_system_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") release 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_store_system_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_store_system_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_store_system_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_store_system_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("one-as") seq_cst 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_atomicrmw_singlethread_unordered +body: | + bb.0: + ; GFX7-LABEL: name: local_atomicrmw_singlethread_unordered + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_atomicrmw_singlethread_unordered + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_atomicrmw_singlethread_unordered + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") unordered 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_atomicrmw_singlethread_monotonic +body: | + bb.0: + ; GFX7-LABEL: name: local_atomicrmw_singlethread_monotonic + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_atomicrmw_singlethread_monotonic + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_atomicrmw_singlethread_monotonic + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") monotonic 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_atomicrmw_singlethread_acquire +body: | + bb.0: + ; GFX7-LABEL: name: local_atomicrmw_singlethread_acquire + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_atomicrmw_singlethread_acquire + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_atomicrmw_singlethread_acquire + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acquire 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_atomicrmw_singlethread_release +body: | + bb.0: + ; GFX7-LABEL: name: local_atomicrmw_singlethread_release + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_atomicrmw_singlethread_release + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_atomicrmw_singlethread_release + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") release 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_atomicrmw_singlethread_acq_rel +body: | + bb.0: + ; GFX7-LABEL: name: local_atomicrmw_singlethread_acq_rel + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_atomicrmw_singlethread_acq_rel + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_atomicrmw_singlethread_acq_rel + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") acq_rel 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: local_atomicrmw_singlethread_seq_cst +body: | + bb.0: + ; GFX7-LABEL: name: local_atomicrmw_singlethread_seq_cst + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX7: $m0 = S_MOV_B32 -1 + ; GFX7: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: local_atomicrmw_singlethread_seq_cst + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-WGP: $m0 = S_MOV_B32 -1 + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: local_atomicrmw_singlethread_seq_cst + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + ; GFX10-CU: $m0 = S_MOV_B32 -1 + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`, addrspace 3) + ; GFX10-CU: S_ENDPGM 0 + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 36, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 40, 0, 0 :: (dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4) + $m0 = S_MOV_B32 -1 + $vgpr1 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + $vgpr2 = DS_WRXCHG_RTN_B32 killed renamable $vgpr0, killed renamable $vgpr1, 0, 0, implicit $m0, implicit $exec :: (volatile store syncscope("singlethread-one-as") seq_cst 4 into `i32 addrspace(3)* undef`) + S_ENDPGM 0 + +... +--- + +name: multiple_mem_operands +body: | + ; GFX7-LABEL: name: multiple_mem_operands + ; GFX7: bb.0.entry: + ; GFX7: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX7: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX7: S_WAITCNT 127 + ; GFX7: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX7: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX7: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX7: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX7: bb.1: + ; GFX7: successors: %bb.3(0x80000000) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX7: S_BRANCH %bb.3 + ; GFX7: bb.2: + ; GFX7: successors: %bb.3(0x80000000) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX7: bb.3: + ; GFX7: S_WAITCNT 127 + ; GFX7: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX7: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX7: S_WAITCNT 3952 + ; GFX7: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 1, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`, addrspace 1), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX7: S_WAITCNT 3952 + ; GFX7: BUFFER_WBINVL1_VOL implicit $exec + ; GFX7: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX7: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX7: S_WAITCNT 3952 + ; GFX7: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: multiple_mem_operands + ; GFX10-WGP: bb.0.entry: + ; GFX10-WGP: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX10-WGP: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: S_WAITCNT 127 + ; GFX10-WGP: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-WGP: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX10-WGP: bb.1: + ; GFX10-WGP: successors: %bb.3(0x80000000) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-WGP: S_BRANCH %bb.3 + ; GFX10-WGP: bb.2: + ; GFX10-WGP: successors: %bb.3(0x80000000) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX10-WGP: bb.3: + ; GFX10-WGP: S_WAITCNT 127 + ; GFX10-WGP: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX10-WGP: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX10-WGP: S_WAITCNT 16240 + ; GFX10-WGP: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10-WGP: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 1, 0, 0, 1, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`, addrspace 1), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: S_WAITCNT 16240 + ; GFX10-WGP: BUFFER_GL0_INV implicit $exec + ; GFX10-WGP: BUFFER_GL1_INV implicit $exec + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX10-WGP: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX10-WGP: S_WAITCNT 3952 + ; GFX10-WGP: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: multiple_mem_operands + ; GFX10-CU: bb.0.entry: + ; GFX10-CU: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX10-CU: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: S_WAITCNT 127 + ; GFX10-CU: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-CU: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX10-CU: bb.1: + ; GFX10-CU: successors: %bb.3(0x80000000) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-CU: S_BRANCH %bb.3 + ; GFX10-CU: bb.2: + ; GFX10-CU: successors: %bb.3(0x80000000) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX10-CU: bb.3: + ; GFX10-CU: S_WAITCNT 127 + ; GFX10-CU: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX10-CU: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX10-CU: S_WAITCNT 16240 + ; GFX10-CU: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10-CU: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 1, 0, 0, 1, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`, addrspace 1), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: S_WAITCNT 16240 + ; GFX10-CU: BUFFER_GL0_INV implicit $exec + ; GFX10-CU: BUFFER_GL1_INV implicit $exec + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX10-CU: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX10-CU: S_WAITCNT 3952 + ; GFX10-CU: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX10-CU: S_ENDPGM 0 + bb.0.entry: + successors: %bb.1(0x30000000), %bb.2(0x50000000) + liveins: $sgpr0_sgpr1, $sgpr3 + + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) + $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + S_WAITCNT 127 + S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 2, implicit $exec + $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + S_CBRANCH_SCC0 %bb.1, implicit killed $scc + + bb.2: + successors: %bb.3(0x80000000) + liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 + + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + S_BRANCH %bb.3 + + bb.1: + successors: %bb.3(0x80000000) + liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 + + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 4, implicit $exec + + bb.3: + liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0 + + S_WAITCNT 127 + $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) + $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + S_WAITCNT 3952 + FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`) + S_ENDPGM 0 + +... +--- + +name: multiple_mem_operands_nontemporal_1 +body: | + ; GFX7-LABEL: name: multiple_mem_operands_nontemporal_1 + ; GFX7: bb.0.entry: + ; GFX7: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX7: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX7: S_WAITCNT 127 + ; GFX7: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX7: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX7: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX7: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX7: bb.1: + ; GFX7: successors: %bb.3(0x80000000) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX7: S_BRANCH %bb.3 + ; GFX7: bb.2: + ; GFX7: successors: %bb.3(0x80000000) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX7: bb.3: + ; GFX7: S_WAITCNT 127 + ; GFX7: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX7: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX7: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 1, 1, 0, 0, 0, implicit $exec :: (non-temporal load 4 from `i32 addrspace(1)* undef`, addrspace 1), (non-temporal load 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX7: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX7: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX7: S_WAITCNT 3952 + ; GFX7: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: multiple_mem_operands_nontemporal_1 + ; GFX10-WGP: bb.0.entry: + ; GFX10-WGP: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX10-WGP: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: S_WAITCNT 127 + ; GFX10-WGP: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-WGP: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX10-WGP: bb.1: + ; GFX10-WGP: successors: %bb.3(0x80000000) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-WGP: S_BRANCH %bb.3 + ; GFX10-WGP: bb.2: + ; GFX10-WGP: successors: %bb.3(0x80000000) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX10-WGP: bb.3: + ; GFX10-WGP: S_WAITCNT 127 + ; GFX10-WGP: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX10-WGP: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX10-WGP: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 1, 0, 0, 0, implicit $exec :: (non-temporal load 4 from `i32 addrspace(1)* undef`, addrspace 1), (non-temporal load 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX10-WGP: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX10-WGP: S_WAITCNT 3952 + ; GFX10-WGP: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: multiple_mem_operands_nontemporal_1 + ; GFX10-CU: bb.0.entry: + ; GFX10-CU: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX10-CU: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: S_WAITCNT 127 + ; GFX10-CU: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-CU: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX10-CU: bb.1: + ; GFX10-CU: successors: %bb.3(0x80000000) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-CU: S_BRANCH %bb.3 + ; GFX10-CU: bb.2: + ; GFX10-CU: successors: %bb.3(0x80000000) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX10-CU: bb.3: + ; GFX10-CU: S_WAITCNT 127 + ; GFX10-CU: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX10-CU: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX10-CU: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 1, 0, 0, 0, implicit $exec :: (non-temporal load 4 from `i32 addrspace(1)* undef`, addrspace 1), (non-temporal load 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX10-CU: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX10-CU: S_WAITCNT 3952 + ; GFX10-CU: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX10-CU: S_ENDPGM 0 + bb.0.entry: + successors: %bb.1(0x30000000), %bb.2(0x50000000) + liveins: $sgpr0_sgpr1, $sgpr3 + + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) + $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + S_WAITCNT 127 + S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 2, implicit $exec + $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + S_CBRANCH_SCC0 %bb.1, implicit killed $scc + + bb.2: + successors: %bb.3(0x80000000) + liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 + + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + S_BRANCH %bb.3 + + bb.1: + successors: %bb.3(0x80000000) + liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 + + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 4, implicit $exec + + bb.3: + liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0 + + S_WAITCNT 127 + $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from `i32 addrspace(1)* undef`), (non-temporal load 4 from `[8192 x i32] addrspace(5)* undef`) + $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + S_WAITCNT 3952 + FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`) + S_ENDPGM 0 + +... +--- + +name: multiple_mem_operands_nontemporal_2 +body: | + ; GFX7-LABEL: name: multiple_mem_operands_nontemporal_2 + ; GFX7: bb.0.entry: + ; GFX7: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX7: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX7: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX7: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX7: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX7: S_WAITCNT 127 + ; GFX7: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX7: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX7: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX7: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX7: bb.1: + ; GFX7: successors: %bb.3(0x80000000) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX7: S_BRANCH %bb.3 + ; GFX7: bb.2: + ; GFX7: successors: %bb.3(0x80000000) + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: S_WAITCNT 3855 + ; GFX7: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX7: bb.3: + ; GFX7: S_WAITCNT 127 + ; GFX7: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX7: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX7: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(1)* undef`, addrspace 1), (non-temporal load 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX7: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX7: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX7: S_WAITCNT 3952 + ; GFX7: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: multiple_mem_operands_nontemporal_2 + ; GFX10-WGP: bb.0.entry: + ; GFX10-WGP: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX10-WGP: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX10-WGP: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: S_WAITCNT 127 + ; GFX10-WGP: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-WGP: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX10-WGP: bb.1: + ; GFX10-WGP: successors: %bb.3(0x80000000) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-WGP: S_BRANCH %bb.3 + ; GFX10-WGP: bb.2: + ; GFX10-WGP: successors: %bb.3(0x80000000) + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: S_WAITCNT 3855 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX10-WGP: bb.3: + ; GFX10-WGP: S_WAITCNT 127 + ; GFX10-WGP: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX10-WGP: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX10-WGP: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(1)* undef`, addrspace 1), (non-temporal load 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX10-WGP: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX10-WGP: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX10-WGP: S_WAITCNT 3952 + ; GFX10-WGP: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: multiple_mem_operands_nontemporal_2 + ; GFX10-CU: bb.0.entry: + ; GFX10-CU: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; GFX10-CU: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; GFX10-CU: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: S_WAITCNT 127 + ; GFX10-CU: S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 2, implicit $exec + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-CU: BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: S_CBRANCH_SCC0 %bb.2, implicit killed $scc + ; GFX10-CU: bb.1: + ; GFX10-CU: successors: %bb.3(0x80000000) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + ; GFX10-CU: S_BRANCH %bb.3 + ; GFX10-CU: bb.2: + ; GFX10-CU: successors: %bb.3(0x80000000) + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: S_WAITCNT 3855 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; GFX10-CU: bb.3: + ; GFX10-CU: S_WAITCNT 127 + ; GFX10-CU: $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + ; GFX10-CU: $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + ; GFX10-CU: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(1)* undef`, addrspace 1), (non-temporal load 4 from `[8192 x i32] addrspace(5)* undef`, addrspace 5) + ; GFX10-CU: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + ; GFX10-CU: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + ; GFX10-CU: S_WAITCNT 3952 + ; GFX10-CU: FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GFX10-CU: S_ENDPGM 0 + bb.0.entry: + successors: %bb.1(0x30000000), %bb.2(0x50000000) + liveins: $sgpr0_sgpr1, $sgpr3 + + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) + $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 + $vgpr0 = V_MOV_B32_e32 1, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + S_WAITCNT 127 + S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 2, implicit $exec + $vgpr1 = V_MOV_B32_e32 32772, implicit $exec + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + S_CBRANCH_SCC0 %bb.1, implicit killed $scc + + bb.2: + successors: %bb.3(0x80000000) + liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 + + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 32772, implicit $exec + S_BRANCH %bb.3 + + bb.1: + successors: %bb.3(0x80000000) + liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11 + + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + S_WAITCNT 3855 + $vgpr0 = V_MOV_B32_e32 4, implicit $exec + + bb.3: + liveins: $sgpr3, $sgpr4_sgpr5, $sgpr8_sgpr9_sgpr10_sgpr11, $vgpr0, $sgpr0 + + S_WAITCNT 127 + $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc + $vgpr0 = V_ADD_CO_U32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from `i32 addrspace(1)* undef`), (non-temporal load 4 from `[8192 x i32] addrspace(5)* undef`) + $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 + $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec + S_WAITCNT 3952 + FLAT_STORE_DWORD killed $vgpr1_vgpr2, killed $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4 into `i32 addrspace(1)* undef`) + S_ENDPGM 0 + +... +--- + +name: atomic_max_i32_noret +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '$sgpr0_sgpr1' } + - { reg: '$vgpr0' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + ; GFX7-LABEL: name: atomic_max_i32_noret + ; GFX7: bb.0: + ; GFX7: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX7: liveins: $vgpr0, $sgpr0_sgpr1 + ; GFX7: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 11, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX7: $vgpr1 = V_ASHRREV_I32_e32 31, $vgpr0, implicit $exec + ; GFX7: $vgpr1_vgpr2 = V_LSHL_B64 $vgpr0_vgpr1, 3, implicit $exec + ; GFX7: $sgpr7 = S_MOV_B32 61440 + ; GFX7: $sgpr6 = S_MOV_B32 0 + ; GFX7: S_WAITCNT 127 + ; GFX7: $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from `i32 addrspace(1)* addrspace(1)* undef`, addrspace 1) + ; GFX7: $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec + ; GFX7: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX7: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX7: $sgpr2_sgpr3 = S_XOR_B64 $exec, killed $sgpr2_sgpr3, implicit-def dead $scc + ; GFX7: SI_MASK_BRANCH %bb.2, implicit $exec + ; GFX7: bb.1: + ; GFX7: successors: %bb.2(0x80000000) + ; GFX7: liveins: $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000000C, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr1_vgpr2_vgpr3_vgpr4:0x0000000000000003 + ; GFX7: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 15, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX7: dead $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX7: dead $vgpr0 = V_MOV_B32_e32 61440, implicit $exec + ; GFX7: $sgpr4_sgpr5 = S_MOV_B64 0 + ; GFX7: S_WAITCNT 127 + ; GFX7: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX7: S_WAITCNT 3952 + ; GFX7: S_WAITCNT 3952 + ; GFX7: BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(1)* undef`, addrspace 1) + ; GFX7: S_WAITCNT 3952 + ; GFX7: BUFFER_WBINVL1_VOL implicit $exec + ; GFX7: bb.2: + ; GFX7: liveins: $sgpr2_sgpr3 + ; GFX7: $exec = S_OR_B64 $exec, killed $sgpr2_sgpr3, implicit-def $scc + ; GFX7: S_ENDPGM 0 + ; GFX10-WGP-LABEL: name: atomic_max_i32_noret + ; GFX10-WGP: bb.0: + ; GFX10-WGP: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-WGP: liveins: $vgpr0, $sgpr0_sgpr1 + ; GFX10-WGP: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 11, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: $vgpr1 = V_ASHRREV_I32_e32 31, $vgpr0, implicit $exec + ; GFX10-WGP: $vgpr1_vgpr2 = V_LSHL_B64 $vgpr0_vgpr1, 3, implicit $exec + ; GFX10-WGP: $sgpr7 = S_MOV_B32 61440 + ; GFX10-WGP: $sgpr6 = S_MOV_B32 0 + ; GFX10-WGP: S_WAITCNT 127 + ; GFX10-WGP: $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from `i32 addrspace(1)* addrspace(1)* undef`, addrspace 1) + ; GFX10-WGP: $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec + ; GFX10-WGP: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX10-WGP: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-WGP: $sgpr2_sgpr3 = S_XOR_B64 $exec, killed $sgpr2_sgpr3, implicit-def dead $scc + ; GFX10-WGP: SI_MASK_BRANCH %bb.2, implicit $exec + ; GFX10-WGP: bb.1: + ; GFX10-WGP: successors: %bb.2(0x80000000) + ; GFX10-WGP: liveins: $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000000C, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr1_vgpr2_vgpr3_vgpr4:0x0000000000000003 + ; GFX10-WGP: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 15, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-WGP: dead $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX10-WGP: dead $vgpr0 = V_MOV_B32_e32 61440, implicit $exec + ; GFX10-WGP: $sgpr4_sgpr5 = S_MOV_B64 0 + ; GFX10-WGP: S_WAITCNT 127 + ; GFX10-WGP: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-WGP: S_WAITCNT 3952 + ; GFX10-WGP: S_WAITCNT 16240 + ; GFX10-WGP: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10-WGP: BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(1)* undef`, addrspace 1) + ; GFX10-WGP: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10-WGP: BUFFER_GL0_INV implicit $exec + ; GFX10-WGP: BUFFER_GL1_INV implicit $exec + ; GFX10-WGP: bb.2: + ; GFX10-WGP: liveins: $sgpr2_sgpr3 + ; GFX10-WGP: $exec = S_OR_B64 $exec, killed $sgpr2_sgpr3, implicit-def $scc + ; GFX10-WGP: S_ENDPGM 0 + ; GFX10-CU-LABEL: name: atomic_max_i32_noret + ; GFX10-CU: bb.0: + ; GFX10-CU: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GFX10-CU: liveins: $vgpr0, $sgpr0_sgpr1 + ; GFX10-CU: $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 11, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: $vgpr1 = V_ASHRREV_I32_e32 31, $vgpr0, implicit $exec + ; GFX10-CU: $vgpr1_vgpr2 = V_LSHL_B64 $vgpr0_vgpr1, 3, implicit $exec + ; GFX10-CU: $sgpr7 = S_MOV_B32 61440 + ; GFX10-CU: $sgpr6 = S_MOV_B32 0 + ; GFX10-CU: S_WAITCNT 127 + ; GFX10-CU: $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from `i32 addrspace(1)* addrspace(1)* undef`, addrspace 1) + ; GFX10-CU: $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec + ; GFX10-CU: V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + ; GFX10-CU: $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-CU: $sgpr2_sgpr3 = S_XOR_B64 $exec, killed $sgpr2_sgpr3, implicit-def dead $scc + ; GFX10-CU: SI_MASK_BRANCH %bb.2, implicit $exec + ; GFX10-CU: bb.1: + ; GFX10-CU: successors: %bb.2(0x80000000) + ; GFX10-CU: liveins: $sgpr4_sgpr5_sgpr6_sgpr7:0x000000000000000C, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr1_vgpr2_vgpr3_vgpr4:0x0000000000000003 + ; GFX10-CU: $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 15, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; GFX10-CU: dead $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + ; GFX10-CU: dead $vgpr0 = V_MOV_B32_e32 61440, implicit $exec + ; GFX10-CU: $sgpr4_sgpr5 = S_MOV_B64 0 + ; GFX10-CU: S_WAITCNT 127 + ; GFX10-CU: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + ; GFX10-CU: S_WAITCNT 3952 + ; GFX10-CU: S_WAITCNT 16240 + ; GFX10-CU: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10-CU: BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(1)* undef`, addrspace 1) + ; GFX10-CU: S_WAITCNT_VSCNT undef $sgpr_null, 0 + ; GFX10-CU: BUFFER_GL0_INV implicit $exec + ; GFX10-CU: BUFFER_GL1_INV implicit $exec + ; GFX10-CU: bb.2: + ; GFX10-CU: liveins: $sgpr2_sgpr3 + ; GFX10-CU: $exec = S_OR_B64 $exec, killed $sgpr2_sgpr3, implicit-def $scc + ; GFX10-CU: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0, $sgpr0_sgpr1 + + $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 11, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(4)* undef`) + $vgpr1 = V_ASHRREV_I32_e32 31, $vgpr0, implicit $exec + $vgpr1_vgpr2 = V_LSHL_B64 $vgpr0_vgpr1, 3, implicit $exec + $sgpr7 = S_MOV_B32 61440 + $sgpr6 = S_MOV_B32 0 + S_WAITCNT 127 + $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from `i32 addrspace(1)* addrspace(1)* undef`) + $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec + V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec + $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec + $sgpr2_sgpr3 = S_XOR_B64 $exec, killed $sgpr2_sgpr3, implicit-def dead $scc + SI_MASK_BRANCH %bb.2, implicit $exec + + bb.1: + successors: %bb.2(0x80000000) + liveins: $sgpr4_sgpr5_sgpr6_sgpr7:0x0000000C, $sgpr0_sgpr1, $sgpr2_sgpr3, $vgpr1_vgpr2_vgpr3_vgpr4:0x00000003 + + $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 15, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(4)* undef`) + dead $vgpr0 = V_MOV_B32_e32 -1, implicit $exec + dead $vgpr0 = V_MOV_B32_e32 61440, implicit $exec + $sgpr4_sgpr5 = S_MOV_B64 0 + S_WAITCNT 127 + $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec + S_WAITCNT 3952 + BUFFER_ATOMIC_SMAX_ADDR64 killed $vgpr0, killed $vgpr1_vgpr2, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 400, 0, implicit $exec :: (volatile load syncscope("one-as") seq_cst 4 from `i32 addrspace(1)* undef`) + + bb.2: + liveins: $sgpr2_sgpr3 + + $exec = S_OR_B64 $exec, killed $sgpr2_sgpr3, implicit-def $scc + S_ENDPGM 0 + +... +