diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll --- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll @@ -1,17 +1,51 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1) -; GCN-LABEL: {{^}}lds_ds_fadd: -; VI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 -; GCN: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 -; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; GCN: s_waitcnt lgkmcnt(1) -; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { +; VI-LABEL: lds_ds_fadd: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s4, s3, 3 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; VI-NEXT: s_lshl_b32 s3, s3, 4 +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: ds_add_f32 v2, v0 offset:64 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_waitcnt lgkmcnt(1) +; VI-NEXT: ds_add_rtn_f32 v2, v0, v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fadd: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: s_lshl_b32 s0, s3, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_add_f32 v2, v0 offset:64 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -1,42 +1,335 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX678,HAS-ATOMICS %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,HAS-ATOMICS %s -; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX678,NO-ATOMICS %s -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX678,NO-ATOMICS %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s -; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f32: -; GFX678-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0 -; HAS-ATOMICS: ds_add_rtn_f32 v0, v0, [[K]] - -; NO-ATOMICS: ds_read_b32 -; NO-ATOMICS: v_add_f32 -; NO-ATOMICS: ds_cmpst_rtn_b32 -; NO-ATOMICS: s_cbranch_execnz define float @lds_atomic_fadd_ret_f32(float addrspace(3)* %ptr) nounwind { +; GFX6-LABEL: lds_atomic_fadd_ret_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB0_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB0_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd float addrspace(3)* %ptr, float 4.0 seq_cst ret float %result } -; GCN-LABEL: {{^}}lds_atomic_fadd_noret_f32: -; GFX678-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0 -; HAS-ATOMICS: ds_add_f32 v0, [[K]] define void @lds_atomic_fadd_noret_f32(float addrspace(3)* %ptr) nounwind { +; GFX6-LABEL: lds_atomic_fadd_noret_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b32 v1, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB1_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v1, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v2, 4.0, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_add_f32 v0, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_f32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd float addrspace(3)* %ptr, float 4.0 seq_cst ret void } -; GCN-LABEL: {{^}}lds_ds_fadd: -; VI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 -; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 -; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; HAS-ATOMICS: s_waitcnt lgkmcnt(0) -; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { +; GFX6-LABEL: lds_ds_fadd: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xc +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s3, 3 +; GFX6-NEXT: s_add_i32 s0, s0, 32 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_add_i32 s3, s3, 4 +; GFX6-NEXT: s_lshl_b32 s6, s3, 3 +; GFX6-NEXT: s_lshl_b32 s3, s3, 4 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_cbranch_execnz .LBB2_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: ds_read_b32 v1, v1 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: .LBB2_3: ; %atomicrmw.start2 +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_cbranch_execnz .LBB2_3 +; GFX6-NEXT: ; %bb.4: ; %atomicrmw.end1 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: ds_read_b32 v1, v1 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: .LBB2_5: ; %atomicrmw.start8 +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_cbranch_execnz .LBB2_5 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end7 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: lds_ds_fadd: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s4, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_read_b32 v0, v0 offset:32 +; GFX7-NEXT: s_add_i32 s3, s3, 4 +; GFX7-NEXT: s_lshl_b32 s6, s3, 3 +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB2_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB2_3: ; %atomicrmw.start2 +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB2_3 +; GFX7-NEXT: ; %bb.4: ; %atomicrmw.end1 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB2_5: ; %atomicrmw.start8 +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB2_5 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end7 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_ds_fadd: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s4, s3, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s3, s3, 4 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_add_f32 v2, v0 offset:64 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_add_rtn_f32 v2, v0, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fadd: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s3, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_f32 v2, v0 offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 @@ -49,15 +342,182 @@ ret void } -; GCN-LABEL: {{^}}lds_ds_fadd_one_as: -; VI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 -; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 -; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; HAS-ATOMICS: s_waitcnt lgkmcnt(1) -; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] define amdgpu_kernel void @lds_ds_fadd_one_as(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { +; GFX6-LABEL: lds_ds_fadd_one_as: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s3, s[0:1], 0xc +; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s0, s3, 3 +; GFX6-NEXT: s_add_i32 s0, s0, 32 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: ds_read_b32 v0, v0 +; GFX6-NEXT: s_add_i32 s3, s3, 4 +; GFX6-NEXT: s_lshl_b32 s6, s3, 3 +; GFX6-NEXT: s_lshl_b32 s3, s3, 4 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v1, v0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 +; GFX6-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_cbranch_execnz .LBB3_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: ds_read_b32 v1, v1 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: .LBB3_3: ; %atomicrmw.start2 +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, s3 +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, v2 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_cbranch_execnz .LBB3_3 +; GFX6-NEXT: ; %bb.4: ; %atomicrmw.end1 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: ds_read_b32 v1, v1 +; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: .LBB3_5: ; %atomicrmw.start8 +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v2, v1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX6-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_cbranch_execnz .LBB3_5 +; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end7 +; GFX6-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: lds_ds_fadd_one_as: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s4, s3, 3 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: ds_read_b32 v0, v0 offset:32 +; GFX7-NEXT: s_add_i32 s3, s3, 4 +; GFX7-NEXT: s_lshl_b32 s6, s3, 3 +; GFX7-NEXT: s_lshl_b32 s3, s3, 4 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX7-NEXT: ds_cmpst_rtn_b32 v0, v0, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB3_3: ; %atomicrmw.start2 +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f32_e32 v2, 0x42280000, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v3, v1, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, v2 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_3 +; GFX7-NEXT: ; %bb.4: ; %atomicrmw.end1 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: ds_read_b32 v1, v1 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB3_5: ; %atomicrmw.start8 +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: v_add_f32_e32 v3, v2, v0 +; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v1, v2, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB3_5 +; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end7 +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: buffer_store_dword v1, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: lds_ds_fadd_one_as: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b32 s4, s3, 3 +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX8-NEXT: s_lshl_b32 s3, s3, 4 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: ds_add_f32 v2, v0 offset:64 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_waitcnt lgkmcnt(1) +; GFX8-NEXT: ds_add_rtn_f32 v2, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fadd_one_as: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s3, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-NEXT: ds_add_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: s_lshl_b32 s0, s3, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_add_f32 v2, v0 offset:64 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm %idx.add = add nuw i32 %idx, 4 %shl0 = shl i32 %idx.add, 3 %shl1 = shl i32 %idx.add, 4 @@ -70,61 +530,580 @@ ret void } -; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f64: -; GCN: ds_read_b64 -; GCN: v_add_f64 -; GCN: ds_cmpst_rtn_b64 -; GCN: s_cbranch_execnz define double @lds_atomic_fadd_ret_f64(double addrspace(3)* %ptr) nounwind { +; GFX6-LABEL: lds_atomic_fadd_ret_f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: v_mov_b32_e32 v2, v0 +; GFX6-NEXT: ds_read_b64 v[0:1], v0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v4, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, v0 +; GFX6-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB4_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_ret_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: ds_read_b64 v[0:1], v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, v1 +; GFX7-NEXT: v_mov_b32_e32 v3, v0 +; GFX7-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB4_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_ret_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: ds_read_b64 v[0:1], v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v0 +; GFX8-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB4_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_ret_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: ds_read_b64 v[0:1], v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB4_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst ret double %result } -; GCN-LABEL: {{^}}lds_atomic_fadd_noret_f64: -; GCN: ds_read_b64 -; GCN: v_add_f64 -; GCN: ds_cmpst_rtn_b64 -; GCN: s_cbranch_execnz define void @lds_atomic_fadd_noret_f64(double addrspace(3)* %ptr) nounwind { +; GFX6-LABEL: lds_atomic_fadd_noret_f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[1:2], v0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX6-NEXT: v_mov_b32_e32 v1, v3 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v2, v4 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB5_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fadd_noret_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b64 v[1:2], v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX7-NEXT: v_mov_b32_e32 v1, v3 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fadd_noret_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b64 v[1:2], v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fadd_noret_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[1:2], v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], 4.0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst ret void } -; GCN-LABEL: {{^}}lds_atomic_fsub_ret_f32: -; GCN: ds_read_b32 -; GCN: v_sub_f32 -; GCN: ds_cmpst_rtn_b32 -; GCN: s_cbranch_execnz define float @lds_atomic_fsub_ret_f32(float addrspace(3)* %ptr, float %val) nounwind { +; GFX6-LABEL: lds_atomic_fsub_ret_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB6_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fsub_ret_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v3, v2 +; GFX7-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB6_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fsub_ret_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, v2 +; GFX8-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB6_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fsub_ret_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_sub_f32_e32 v2, v3, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b32 v2, v0, v3, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub float addrspace(3)* %ptr, float %val seq_cst ret float %result } -; GCN-LABEL: {{^}}lds_atomic_fsub_noret_f32: -; GCN: ds_read_b32 -; GCN: v_sub_f32 -; GCN: ds_cmpst_rtn_b32 define void @lds_atomic_fsub_noret_f32(float addrspace(3)* %ptr, float %val) nounwind { +; GFX6-LABEL: lds_atomic_fsub_noret_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b32 v2, v0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v2, v3 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB7_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fsub_noret_f32: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b32 v2, v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v3 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB7_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fsub_noret_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b32 v2, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v3 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB7_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fsub_noret_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b32 v2, v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_sub_f32_e32 v3, v2, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b32 v3, v0, v2, v3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB7_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub float addrspace(3)* %ptr, float %val seq_cst ret void } -; GCN-LABEL: {{^}}lds_atomic_fsub_ret_f64: -; GCN: ds_read_b64 -; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} -; GCN: ds_cmpst_rtn_b64 - define double @lds_atomic_fsub_ret_f64(double addrspace(3)* %ptr, double %val) nounwind { +; GFX6-LABEL: lds_atomic_fsub_ret_f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[3:4], v0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, v4 +; GFX6-NEXT: v_mov_b32_e32 v5, v3 +; GFX6-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB8_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v0, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, v4 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fsub_ret_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b64 v[3:4], v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_mov_b32_e32 v5, v3 +; GFX7-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB8_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fsub_ret_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b64 v[3:4], v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_mov_b32_e32 v5, v3 +; GFX8-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB8_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fsub_ret_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[3:4], v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_add_f64 v[3:4], v[5:6], -v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB8_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub double addrspace(3)* %ptr, double %val seq_cst ret double %result } -; GCN-LABEL: {{^}}lds_atomic_fsub_noret_f64: -; GCN: ds_read_b64 -; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} -; GCN: ds_cmpst_rtn_b64 -; GCN: s_cbranch_execnz define void @lds_atomic_fsub_noret_f64(double addrspace(3)* %ptr, double %val) nounwind { +; GFX6-LABEL: lds_atomic_fsub_noret_f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 m0, -1 +; GFX6-NEXT: ds_read_b64 v[3:4], v0 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; GFX6-NEXT: v_mov_b32_e32 v3, v5 +; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX6-NEXT: v_mov_b32_e32 v4, v6 +; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_cbranch_execnz .LBB9_1 +; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX6-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-LABEL: lds_atomic_fsub_noret_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_read_b64 v[3:4], v0 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v3, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v4, v6 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB9_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: lds_atomic_fsub_noret_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: ds_read_b64 v[3:4], v0 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v4, v6 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB9_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: lds_atomic_fsub_noret_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: ds_read_b64 v[3:4], v0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_f64 v[5:6], v[3:4], -v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v6 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fsub double addrspace(3)* %ptr, double %val seq_cst ret void }