# Patch summary (commentary only; the patch proper starts at the first line
# that begins with the Index header below).
#
# SIISelLowering.cpp: in the branch taken when AS == AMDGPUAS::LOCAL_ADDRESS
# and Subtarget->hasLDSFPAtomics(), non-double types still return
# AtomicExpansionKind::None. For double, the old code returned CmpXChg whenever
# fpModeMatchesGlobalFPAtomicMode(RMW) was false; the new code also returns
# None when the enclosing function carries "amdgpu-unsafe-fp-atomics"="true",
# and falls back to CmpXChg only when neither condition holds.
#
# fp64-atomics-gfx90a.ll:
#   * local_atomic_fadd_f64_noret_pat_flush now expects a single ds_add_f64.
#     Its define line lies outside the hunk, so the attribute group it uses is
#     not visible here.
#   * local_atomic_fadd_f64_noret_pat_flush_safe (attributes #4: preserve-sign
#     denormals, no unsafe attribute) takes over the previous
#     atomicrmw.start / atomicrmw.end loop expectations; the loop label is
#     renumbered from BB49_1 to BB50_1.
#   * local_atomic_fadd_f64_rtn_ieee_unsafe (#2) and
#     local_atomic_fadd_f64_rtn_ieee_safe (#3) both expect ds_add_rtn_f64.
#     NOTE(review): both call the llvm.amdgcn.ds.fadd.f64 intrinsic instead of
#     using atomicrmw fadd, and their CHECK lines are identical; confirm they
#     actually exercise the changed atomicrmw expansion decision.
#   * attribute groups #2, #3 and #4 are added.
#
# NOTE(review): the original leading whitespace and column alignment of the
# hunk lines were not available; the indentation below is conventional. Apply
# with whitespace-insensitive matching (e.g. patch -l) or regenerate the diff;
# TODO confirm against the tree.
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12149,8 +12149,15 @@
     // to round-to-nearest-even.
     // The only exception is DS_ADD_F64 which never flushes regardless of mode.
     if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
-      return (Ty->isDoubleTy() && !fpModeMatchesGlobalFPAtomicMode(RMW)) ?
-        AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
+      if (!Ty->isDoubleTy())
+        return AtomicExpansionKind::None;
+
+      return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
+              RMW->getFunction()
+                      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+                      .getValueAsString() == "true")
+                 ? AtomicExpansionKind::None
+                 : AtomicExpansionKind::CmpXChg;
     }
 
     return AtomicExpansionKind::CmpXChg;
Index: llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -899,11 +899,28 @@
 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
 ; GFX90A: ; %bb.0: ; %main_body
 ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_endpgm
+main_body:
+  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  ret void
+}
+
+define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(double addrspace(3)* %ptr) #4 {
+; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0
 ; GFX90A-NEXT: ds_read_b64 v[0:1], v0
-; GFX90A-NEXT: BB49_1: ; %atomicrmw.start
+; GFX90A-NEXT: BB50_1: ; %atomicrmw.start
 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
@@ -915,7 +932,7 @@
 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz BB49_1
+; GFX90A-NEXT: s_cbranch_execnz BB50_1
 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT: s_endpgm
 main_body:
@@ -938,5 +955,36 @@
   ret double %ret
 }
 
+define double @local_atomic_fadd_f64_rtn_ieee_unsafe(double addrspace(3)* %ptr, double %data) #2 {
+; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  ret double %ret
+}
+
+define double @local_atomic_fadd_f64_rtn_ieee_safe(double addrspace(3)* %ptr, double %data) #3 {
+; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v2, v1
+; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+main_body:
+  %ret = call double @llvm.amdgcn.ds.fadd.f64(double addrspace(3)* %ptr, double %data, i32 0, i32 0, i1 0)
+  ret double %ret
+}
+
 attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" }
+attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" }
+attributes #3 = { "denormal-fp-math"="ieee,ieee" }
+attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" }