Regenerate the AMDGPU LDS floating-point atomic tests with
utils/update_llc_test_checks.py.

lds-atomic-fadd.ll: tonga and gfx900 each get their own prefix (VI, GFX9).

local-atomics-fp.ll: tonga, tahiti and hawaii keep the shared GFX678 prefix
for the generated checks that all three RUN lines use (the f64 fadd and all
fsub functions). With GFX678 as the only prefix, the f32 fadd and ds.fadd
functions ended up with no tonga/tahiti/hawaii checks at all -- presumably
because the targets disagree there and the update script emits nothing for a
prefix with conflicting output. To keep that coverage, tonga additionally gets
VI, tahiti/hawaii additionally get GFX67, and the previous hand-written
patterns for those four functions are carried over under the new prefixes.
Rerunning the update script should replace them with full generated checks.

diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll
@@ -1,17 +1,51 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
 
 declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1)
 
-; GCN-LABEL: {{^}}lds_ds_fadd:
-; VI-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
-; GCN: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
-; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
-; GCN: s_waitcnt lgkmcnt(1)
-; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
 define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+; VI-LABEL: lds_ds_fadd:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshl_b32 s4, s3, 3
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    ds_add_rtn_f32 v1, v1, v0 offset:32
+; VI-NEXT:    s_lshl_b32 s3, s3, 4
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    ds_add_f32 v2, v0 offset:64
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(1)
+; VI-NEXT:    ds_add_rtn_f32 v2, v0, v1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: lds_ds_fadd:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s0, s3, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    ds_add_rtn_f32 v1, v1, v0 offset:32
+; GFX9-NEXT:    s_lshl_b32 s0, s3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    ds_add_f32 v2, v0 offset:64
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_endpgm
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1,42 +1,83 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX678,HAS-ATOMICS %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,HAS-ATOMICS %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX678,NO-ATOMICS %s
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX678,NO-ATOMICS %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX678,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX678,GFX67 %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX678,GFX67 %s
+; The tonga/tahiti/hawaii checks on the f32 fadd and ds.fadd functions are
+; hand-written patterns; drop this note once they have been regenerated.
 
-; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f32:
-; GFX678-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0
-; HAS-ATOMICS: ds_add_rtn_f32 v0, v0, [[K]]
-
-; NO-ATOMICS: ds_read_b32
-; NO-ATOMICS: v_add_f32
-; NO-ATOMICS: ds_cmpst_rtn_b32
-; NO-ATOMICS: s_cbranch_execnz
 define float @lds_atomic_fadd_ret_f32(float addrspace(3)* %ptr) nounwind {
+; GFX678-LABEL: {{^}}lds_atomic_fadd_ret_f32:
+; GFX678-DAG: s_mov_b32 m0
+; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0
+; VI: ds_add_rtn_f32 v0, v0, [[K]]
+; GFX67: ds_read_b32
+; GFX67: v_add_f32
+; GFX67: ds_cmpst_rtn_b32
+; GFX67: s_cbranch_execnz
+;
+; GFX9-LABEL: lds_atomic_fadd_ret_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd float addrspace(3)* %ptr, float 4.0 seq_cst
   ret float %result
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fadd_noret_f32:
-; GFX678-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; HAS-ATOMICS-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0
-; HAS-ATOMICS: ds_add_f32 v0, [[K]]
 define void @lds_atomic_fadd_noret_f32(float addrspace(3)* %ptr) nounwind {
+; GFX678-LABEL: {{^}}lds_atomic_fadd_noret_f32:
+; GFX678-DAG: s_mov_b32 m0
+; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 4.0
+; VI: ds_add_f32 v0, [[K]]
+;
+; GFX9-LABEL: lds_atomic_fadd_noret_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_f32 v0, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd float addrspace(3)* %ptr, float 4.0 seq_cst
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_ds_fadd:
-; VI-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
-; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
-; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
-; HAS-ATOMICS: s_waitcnt lgkmcnt(0)
-; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
 define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+; GFX678-LABEL: {{^}}lds_ds_fadd:
+; VI-DAG: s_mov_b32 m0
+; VI-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; VI: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
+; VI: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
+; VI: s_waitcnt lgkmcnt(0)
+; VI: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
+;
+; GFX9-LABEL: lds_ds_fadd:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s0, s3, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_rtn_f32 v1, v1, v0 offset:32
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s0, s3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_f32 v2, v0 offset:64
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_endpgm
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
@@ -49,15 +90,34 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_ds_fadd_one_as:
-; VI-DAG: s_mov_b32 m0
-; GFX9-NOT: m0
-; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
-; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
-; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
-; HAS-ATOMICS: s_waitcnt lgkmcnt(1)
-; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
 define amdgpu_kernel void @lds_ds_fadd_one_as(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) {
+; GFX678-LABEL: {{^}}lds_ds_fadd_one_as:
+; VI-DAG: s_mov_b32 m0
+; VI-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000
+; VI: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32
+; VI: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64
+; VI: s_waitcnt lgkmcnt(1)
+; VI: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]]
+;
+; GFX9-LABEL: lds_ds_fadd_one_as:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x42280000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s0, s3, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    ds_add_rtn_f32 v1, v1, v0 offset:32
+; GFX9-NEXT:    s_lshl_b32 s0, s3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s0
+; GFX9-NEXT:    ds_add_f32 v2, v0 offset:64
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX9-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_store_dword v1, v0, s[4:5]
+; GFX9-NEXT:    s_endpgm
   %idx.add = add nuw i32 %idx, 4
   %shl0 = shl i32 %idx.add, 3
   %shl1 = shl i32 %idx.add, 4
@@ -70,61 +130,300 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fadd_ret_f64:
-; GCN: ds_read_b64
-; GCN: v_add_f64
-; GCN: ds_cmpst_rtn_b64
-; GCN: s_cbranch_execnz
 define double @lds_atomic_fadd_ret_f64(double addrspace(3)* %ptr) nounwind {
+; GFX678-LABEL: lds_atomic_fadd_ret_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_mov_b32 m0, -1
+; GFX678-NEXT:    v_mov_b32_e32 v2, v0
+; GFX678-NEXT:    ds_read_b64 v[0:1], v0
+; GFX678-NEXT:    s_mov_b64 s[4:5], 0
+; GFX678-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX678-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v4, v1
+; GFX678-NEXT:    v_mov_b32_e32 v3, v0
+; GFX678-NEXT:    v_add_f64 v[0:1], v[3:4], 4.0
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; GFX678-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX678-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX678-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX678-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fadd_ret_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, v0
+; GFX9-NEXT:    v_add_f64 v[0:1], v[3:4], 4.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
   ret double %result
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fadd_noret_f64:
-; GCN: ds_read_b64
-; GCN: v_add_f64
-; GCN: ds_cmpst_rtn_b64
-; GCN: s_cbranch_execnz
 define void @lds_atomic_fadd_noret_f64(double addrspace(3)* %ptr) nounwind {
+; GFX678-LABEL: lds_atomic_fadd_noret_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_mov_b32 m0, -1
+; GFX678-NEXT:    ds_read_b64 v[1:2], v0
+; GFX678-NEXT:    s_mov_b64 s[4:5], 0
+; GFX678-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX678-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_add_f64 v[3:4], v[1:2], 4.0
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX678-NEXT:    v_mov_b32_e32 v1, v3
+; GFX678-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX678-NEXT:    v_mov_b32_e32 v2, v4
+; GFX678-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX678-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX678-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fadd_noret_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[1:2], v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], 4.0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX9-NEXT:    v_mov_b32_e32 v1, v3
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v4
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fsub_ret_f32:
-; GCN: ds_read_b32
-; GCN: v_sub_f32
-; GCN: ds_cmpst_rtn_b32
-; GCN: s_cbranch_execnz
 define float @lds_atomic_fsub_ret_f32(float addrspace(3)* %ptr, float %val) nounwind {
+; GFX678-LABEL: lds_atomic_fsub_ret_f32:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_mov_b32 m0, -1
+; GFX678-NEXT:    ds_read_b32 v2, v0
+; GFX678-NEXT:    s_mov_b64 s[4:5], 0
+; GFX678-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX678-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v3, v2
+; GFX678-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX678-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX678-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX678-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX678-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    v_mov_b32_e32 v0, v2
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fsub_ret_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v2, v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v3, v2
+; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v2
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub float addrspace(3)* %ptr, float %val seq_cst
   ret float %result
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fsub_noret_f32:
-; GCN: ds_read_b32
-; GCN: v_sub_f32
-; GCN: ds_cmpst_rtn_b32
 define void @lds_atomic_fsub_noret_f32(float addrspace(3)* %ptr, float %val) nounwind {
+; GFX678-LABEL: lds_atomic_fsub_noret_f32:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_mov_b32 m0, -1
+; GFX678-NEXT:    ds_read_b32 v2, v0
+; GFX678-NEXT:    s_mov_b64 s[4:5], 0
+; GFX678-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX678-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX678-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX678-NEXT:    v_mov_b32_e32 v2, v3
+; GFX678-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX678-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX678-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fsub_noret_f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v2, v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v2, v3
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub float addrspace(3)* %ptr, float %val seq_cst
   ret void
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fsub_ret_f64:
-; GCN: ds_read_b64
-; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
-; GCN: ds_cmpst_rtn_b64
-
 define double @lds_atomic_fsub_ret_f64(double addrspace(3)* %ptr, double %val) nounwind {
+; GFX678-LABEL: lds_atomic_fsub_ret_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_mov_b32 m0, -1
+; GFX678-NEXT:    ds_read_b64 v[3:4], v0
+; GFX678-NEXT:    s_mov_b64 s[4:5], 0
+; GFX678-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX678-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_mov_b32_e32 v6, v4
+; GFX678-NEXT:    v_mov_b32_e32 v5, v3
+; GFX678-NEXT:    v_add_f64 v[3:4], v[5:6], -v[1:2]
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4]
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX678-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX678-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX678-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX678-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    v_mov_b32_e32 v0, v3
+; GFX678-NEXT:    v_mov_b32_e32 v1, v4
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fsub_ret_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[3:4], v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9-NEXT:    v_mov_b32_e32 v5, v3
+; GFX9-NEXT:    v_add_f64 v[3:4], v[5:6], -v[1:2]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v0, v3
+; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub double addrspace(3)* %ptr, double %val seq_cst
   ret double %result
 }
 
-; GCN-LABEL: {{^}}lds_atomic_fsub_noret_f64:
-; GCN: ds_read_b64
-; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
-; GCN: ds_cmpst_rtn_b64
-; GCN: s_cbranch_execnz
 define void @lds_atomic_fsub_noret_f64(double addrspace(3)* %ptr, double %val) nounwind {
+; GFX678-LABEL: lds_atomic_fsub_noret_f64:
+; GFX678:       ; %bb.0:
+; GFX678-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX678-NEXT:    s_mov_b32 m0, -1
+; GFX678-NEXT:    ds_read_b64 v[3:4], v0
+; GFX678-NEXT:    s_mov_b64 s[4:5], 0
+; GFX678-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX678-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_add_f64 v[5:6], v[3:4], -v[1:2]
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6]
+; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX678-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; GFX678-NEXT:    v_mov_b32_e32 v3, v5
+; GFX678-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX678-NEXT:    v_mov_b32_e32 v4, v6
+; GFX678-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX678-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX678-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX678-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: lds_atomic_fsub_noret_f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b64 v[3:4], v0
+; GFX9-NEXT:    s_mov_b64 s[4:5], 0
+; GFX9-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_add_f64 v[5:6], v[3:4], -v[1:2]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6]
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; GFX9-NEXT:    v_mov_b32_e32 v3, v5
+; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v4, v6
+; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fsub double addrspace(3)* %ptr, double %val seq_cst
   ret void
 }