diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -2997,6 +2997,10 @@ Specify XNACK mode (AMDGPU only) +.. option:: -munsafe-fp-atomics, -mno-unsafe-fp-atomics + +Enable generation of unsafe floating point atomic instructions. May generate more efficient code, but may not respect rounding and denormal modes, and may give incorrect results for certain memory destinations. (AMDGPU only) + ARM --- .. option:: -faapcs-bitfield-load diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12055,9 +12055,14 @@ if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && Subtarget->hasAtomicFaddInsts()) { - if (!fpModeMatchesGlobalFPAtomicMode(RMW) || - RMW->getFunction()->getFnAttribute("amdgpu-unsafe-fp-atomics") - .getValueAsString() != "true") + // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe + // floating point atomic instructions. May generate more efficient code, + // but may not respect rounding and denormal modes, and may give incorrect + // results for certain memory destinations. + if (!fpModeMatchesGlobalFPAtomicMode(RMW) && + RMW->getFunction() + ->getFnAttribute("amdgpu-unsafe-fp-atomics") + .getValueAsString() != "true") return AtomicExpansionKind::CmpXChg; if (Subtarget->hasGFX90AInsts()) { diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -490,25 +490,13 @@ ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1] -; GFX90A-NEXT: BB27_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc +; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz BB27_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm main_body: %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -171,26 +171,12 @@ ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: BB1_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz BB1_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX90A-NEXT: global_store_dword v[0:1], v0, off ; GFX90A-NEXT: s_endpgm ; @@ -330,49 +316,23 @@ ; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: BB3_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz BB3_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: BB3_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz BB3_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX90A-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee: