Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11771,6 +11771,16 @@ SNaN, Depth); } +// Global FP atomic instructions have a hardcoded FP mode and do not support +// FP32 denormals, and only support v2f16 denormals. +static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) { + const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics(); + auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt); + if (&Flt == &APFloat::IEEEsingle()) + return DenormMode == DenormalMode::getPreserveSign(); + return DenormMode == DenormalMode::getIEEE(); +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { switch (RMW->getOperation()) { @@ -11789,10 +11799,15 @@ unsigned AS = RMW->getPointerAddressSpace(); if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) { + if (!fpModeMatchesGlobalFPAtomicMode(RMW)) + return AtomicExpansionKind::CmpXChg; + return RMW->use_empty() ? AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg; } + // DS FP atomics do respect the denormal mode, but the rounding mode is fixed + // to round-to-nearest-even. return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ? 
AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg; } Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -7,39 +7,59 @@ ; CAS: global_atomic_cmpswap ; CAS: s_andn2_b64 exec, exec, ; CAS-NEXT: s_cbranch_execnz [[LOOP]] -define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) { +define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst + store float %result, float addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32_ieee: +; CAS: [[LOOP:BB[0-9]+_[0-9]+]] +; CAS: v_add_f32_e32 +; CAS: global_atomic_cmpswap +; CAS: s_andn2_b64 exec, exec, +; CAS-NEXT: s_cbranch_execnz [[LOOP]] +define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) { %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void } ; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32: ; GFX900: [[LOOP:BB[0-9]+_[0-9]+]] ; GFX900: v_add_f32_e32 ; GFX900: global_atomic_cmpswap ; GFX900: s_andn2_b64 exec, exec, ; GFX900-NEXT: s_cbranch_execnz [[LOOP]] ; GFX908-NOT: v_add_f32 ; GFX908: global_atomic_add_f32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off ; GFX908-NOT: s_cbranch_execnz -define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) { +define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 { + %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst + ret void +} + +; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32_ieee: +; GCN: global_atomic_cmpswap +define amdgpu_kernel void 
@global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) { %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst ret void } ; Make sure this artificially selects with an incorrect subtarget, but the feature set. ; GCN-LABEL: {{^}}global_atomic_fadd_ret_f32_wrong_subtarget: -define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 { %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void } ; GCN-LABEL: {{^}}global_atomic_fadd_noret_f32_wrong_subtarget: -define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 { %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst ret void } -attributes #0 = { "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" } +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign"} +attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-insts" } Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -115,8 +115,8 @@ ret float %res } -define void @test_atomicrmw_fadd_f32_global_no_use(float addrspace(1)* %ptr, float %value) { -; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use( +define void @test_atomicrmw_fadd_f32_global_no_use_ieee(float addrspace(1)* %ptr, float %value) { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( ; CI-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 ; CI-NEXT: br label 
[[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: @@ -133,7 +133,7 @@ ; CI: atomicrmw.end: ; CI-NEXT: ret void ; -; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_no_use( +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( ; GFX9-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: @@ -150,7 +150,63 @@ ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret void ; -; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use( +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( +; GFX908-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; + %res = atomicrmw fadd float addrspace(1)* %ptr, float %value seq_cst + ret void +} + +define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(float addrspace(1)* %ptr, float %value) #0 { +; CI-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( +; CI-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: 
[[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; CI-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; CI-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; CI-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CI-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( +; GFX9-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX9-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX9-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX9-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( ; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] 
seq_cst ; GFX908-NEXT: ret void ; @@ -407,3 +463,4 @@ ret double %res } +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }