diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX90A +; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg) declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg) @@ -26,6 +27,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_add_noret_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -38,6 +50,13 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -59,6 +78,21 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -76,6 +110,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_add_noret_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -88,6 +133,13 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -109,6 +161,21 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -126,6 +193,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_min_noret_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -138,6 +216,13 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -159,6 +244,21 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -176,6 +276,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_min_noret_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -188,6 +299,13 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -209,6 +327,21 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -226,6 +359,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_max_noret_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) ret void @@ -238,6 +382,13 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0) store double %ret, ptr undef @@ -259,6 +410,21 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -276,6 +442,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v2, s8 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_max_noret_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s8 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) ret void @@ -288,6 +465,13 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) store double %ret, ptr undef @@ -309,6 +493,21 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c +; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s10 +; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2) store double %ret, ptr addrspace(1) %out, align 8 @@ -324,6 +523,16 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: global_atomic_fadd_f64_noret: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void @@ -338,6 +547,16 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: global_atomic_fmin_f64_noret: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void @@ -352,6 +571,16 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: global_atomic_fmax_f64_noret: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret void @@ -383,6 +612,18 @@ ; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: global_atomic_fadd_f64_noret_pat: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst ret void @@ -400,6 +641,18 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst ret void @@ -431,6 +684,18 @@ ; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst ret void @@ -448,6 +713,18 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst ret void @@ -460,6 +737,13 @@ ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_atomic_fadd_f64_rtn: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret @@ -491,6 +775,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst ret double %ret @@ -507,6 +802,17 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_agent: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst ret double %ret @@ -538,6 +844,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_system: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst ret double %ret @@ -550,6 +867,13 @@ ; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_atomic_fmax_f64_rtn: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret @@ -562,6 +886,13 @@ ; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: global_atomic_fmin_f64_rtn: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data) ret double %ret @@ -591,6 +922,18 @@ ; GFX90A-NEXT: s_cbranch_execnz .LBB31_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst ret void @@ -622,6 +965,19 @@ ; GFX90A-NEXT: s_cbranch_execnz .LBB32_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst ret void @@ -640,6 +996,19 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst ret void @@ -672,6 +1041,19 @@ ; GFX90A-NEXT: s_cbranch_execnz .LBB34_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst ret void @@ -703,6 +1085,17 @@ ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst ret double %ret @@ -719,6 +1112,17 @@ ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_agent: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst ret double %ret @@ -751,6 +1155,18 @@ ; GFX90A-NEXT: v_mov_b32_e32 v0, v2 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_system: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst ret double %ret @@ -765,6 +1181,17 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: flat_atomic_fadd_f64_noret: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, s3 +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) ret void @@ -777,6 +1204,13 @@ ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fadd_f64_rtn: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data) ret double %ret @@ -806,6 +1240,19 @@ ; GFX90A-NEXT: s_cbranch_execnz .LBB40_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc1 +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst ret void @@ -820,6 +1267,17 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: flat_atomic_fmin_f64_noret: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, s3 +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) ret void @@ -832,6 +1290,13 @@ ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmin_f64_rtn: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data) ret double %ret @@ -846,6 +1311,17 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: flat_atomic_fmax_f64_noret: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s0 +; GFX940-NEXT: v_mov_b32_e32 v1, s1 +; GFX940-NEXT: v_mov_b32_e32 v2, s2 +; GFX940-NEXT: v_mov_b32_e32 v3, s3 +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) ret void @@ -858,6 +1334,13 @@ ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: flat_atomic_fmax_f64_rtn: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data) ret double %ret @@ -873,6 +1356,16 @@ ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: local_atomic_fadd_f64_noret: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24 +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: ds_add_f64 v2, v[0:1] +; GFX940-NEXT: s_endpgm main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret void @@ -887,6 +1380,15 @@ ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: local_atomic_fadd_f64_rtn: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret @@ -904,6 +1406,17 @@ ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: local_atomic_fadd_f64_noret_pat: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: ds_add_f64 v2, v[0:1] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -921,6 +1434,17 @@ ; GFX90A-NEXT: ds_add_f64 v2, v[0:1] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: ds_add_f64 v2, v[0:1] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -949,6 +1473,29 @@ ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm +; +; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24 +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: ds_read_b64 v[0:1], v0 +; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], 4.0 +; GFX940-NEXT: v_mov_b32_e32 v4, s2 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1] +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[2:3] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB49_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_endpgm main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret void @@ -964,6 +1511,15 @@ ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: local_atomic_fadd_f64_rtn_pat: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst ret double %ret @@ -978,6 +1534,15 @@ ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret @@ -992,6 +1557,15 @@ ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[4:5] ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe: +; GFX940: ; %bb.0: ; %main_body +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3] +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] main_body: %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0) ret double %ret