Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1341,12 +1341,13 @@
   }
   if (ST.hasAtomicFaddInsts())
     Atomic.legalFor({{S32, GlobalPtr}});
+  if (ST.hasFlatAtomicFaddF32Inst())
+    Atomic.legalFor({{S32, FlatPtr}});

   if (ST.hasGFX90AInsts()) {
     // These are legal with some caveats, and should have undergone expansion in
     // the IR in most situations
     // TODO: Move atomic expansion into legalizer
-    // TODO: Also supports <2 x f16>
     Atomic.legalFor({
       {S32, GlobalPtr},
       {S64, GlobalPtr},
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4573,7 +4573,8 @@
   case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
     return getDefaultMappingAllVGPR(MI);
   case Intrinsic::amdgcn_ds_ordered_add:
-  case Intrinsic::amdgcn_ds_ordered_swap: {
+  case Intrinsic::amdgcn_ds_ordered_swap:
+  case Intrinsic::amdgcn_ds_fadd_v2bf16: {
     unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
     unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12785,9 +12785,6 @@

   if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
       Subtarget->hasAtomicFaddNoRtnInsts()) {
-    if (Subtarget->hasGFX940Insts())
-      return AtomicExpansionKind::None;
-
     // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
     // floating point atomic instructions. May generate more efficient code,
     // but may not respect rounding and denormal modes, and may give incorrect
@@ -12797,23 +12794,31 @@
         .getValueAsString() != "true")
       return AtomicExpansionKind::CmpXChg;

-    if (Subtarget->hasGFX90AInsts()) {
-      if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
-        return AtomicExpansionKind::CmpXChg;
+    // Always expand system scope fp atomics.
+    auto SSID = RMW->getSyncScopeID();
+    if (SSID == SyncScope::System ||
+        SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
+      return AtomicExpansionKind::CmpXChg;

-      auto SSID = RMW->getSyncScopeID();
-      if (SSID == SyncScope::System ||
-          SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
-        return AtomicExpansionKind::CmpXChg;
+    if (AS == AMDGPUAS::GLOBAL_ADDRESS && Ty->isFloatTy()) {
+      // global atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
+      if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+        return ReportUnsafeHWInst(AtomicExpansionKind::None);
+      // global atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
+      if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+        return ReportUnsafeHWInst(AtomicExpansionKind::None);
+    }

+    // flat atomic fadd f32: gfx940, gfx11+.
+    if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() &&
+        Subtarget->hasFlatAtomicFaddF32Inst())
       return ReportUnsafeHWInst(AtomicExpansionKind::None);
-    }

-    if (AS == AMDGPUAS::FLAT_ADDRESS)
-      return AtomicExpansionKind::CmpXChg;
+    // global and flat atomic fadd f64: gfx90a, gfx940.
+    if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts())
+      return ReportUnsafeHWInst(AtomicExpansionKind::None);

-    return RMW->use_empty() ? ReportUnsafeHWInst(AtomicExpansionKind::None)
-                            : AtomicExpansionKind::CmpXChg;
+    return AtomicExpansionKind::CmpXChg;
   }

   // DS FP atomics do respect the denormal mode, but the rounding mode is
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll
@@ -1,10 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX940 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s

-; GFX940: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s32) = G_ATOMICRMW_FADD
-
 define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float %data) {
+  ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
+  ; GFX940: bb.1 (%ir-block.0):
+  ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX940-NEXT: {{ $}}
+  ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+  ; GFX940-NEXT: S_ENDPGM 0
   ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic
   ; GFX11: bb.1 (%ir-block.0):
   ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -20,6 +28,17 @@
 }

 define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(float* %ptr, float %data) {
+  ; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic
+  ; GFX940: bb.1 (%ir-block.0):
+  ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX940-NEXT: {{ $}}
+  ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+  ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]]
+  ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic
   ; GFX11: bb.1 (%ir-block.0):
   ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -36,70 +55,52 @@
 }

 define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(float* %ptr, float %data) #0 {
+  ; GFX940-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw
+  ; GFX940: bb.1 (%ir-block.0):
+  ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX940-NEXT: {{ $}}
+  ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 =
COPY $vgpr2 + ; GFX940-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX940-NEXT: S_ENDPGM 0 ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw ; GFX11: bb.1 (%ir-block.0): - ; GFX11-NEXT: successors: %bb.2(0x80000000) ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.2.atomicrmw.start: - ; GFX11-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI %10, %bb.2, [[S_MOV_B32_]], %bb.1 - ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.1, %8, %bb.2 - ; GFX11-NEXT: %7:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, [[PHI1]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic monotonic (s32) on %ir.2) - ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec - ; GFX11-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF_BREAK [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def $scc - ; GFX11-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX11-NEXT: S_BRANCH %bb.3 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.3.atomicrmw.end: - ; GFX11-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[SI_IF_BREAK]], %bb.2 - ; GFX11-NEXT: SI_END_CF [[PHI2]], implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret void } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(float* %ptr, float %data) #0 { + ; GFX940-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; GFX940: bb.1 (%ir-block.0): + ; GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX940-NEXT: {{ $}} + ; GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX940-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX940-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; GFX940-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.1 
(%ir-block.0): - ; GFX11-NEXT: successors: %bb.2(0x80000000) ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[REG_SEQUENCE]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.2.atomicrmw.start: - ; GFX11-NEXT: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI %10, %bb.2, [[S_MOV_B32_]], %bb.1 - ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.1, %8, %bb.2 - ; GFX11-NEXT: %7:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE %7, %subreg.sub0, [[PHI1]], %subreg.sub1 - ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic monotonic (s32) on %ir.2) - ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec - ; GFX11-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF_BREAK [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def $scc - ; GFX11-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX11-NEXT: S_BRANCH %bb.3 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.3.atomicrmw.end: - ; GFX11-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.2 - ; GFX11-NEXT: [[PHI3:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[SI_IF_BREAK]], %bb.2 - ; GFX11-NEXT: SI_END_CF [[PHI3]], implicit-def $exec, implicit-def $scc, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[PHI2]] + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret float %ret Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) @@ -24,6 +24,64 @@ ret void } +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn: ; GFX940: ; %bb.0: @@ -35,6 +93,34 @@ ret float %ret } +define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret float %ret +} + define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) { ; GFX940-LABEL: 
flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -60,6 +146,56 @@ ret <2 x half> %ret } +define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @global_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -83,3 +219,36 @@ %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret } + +define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_bf16 v1, v0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll @@ -4,7 +4,7 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.atomic.fadd) ; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn: ; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc Index: llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll +++ llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f32.ll @@ -72,7 +72,6 @@ ; GFX940-NEXT: S_ENDPGM 0 ; GFX11-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: successors: %bb.1(0x80000000) ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 @@ -80,27 +79,7 @@ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.1.atomicrmw.start: - ; GFX11-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.1 - ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1 - ; GFX11-NEXT: %13:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed %13, %subreg.sub0, [[PHI1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic monotonic (s32) on %ir.2) - ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec - ; GFX11-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc - ; GFX11-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX11-NEXT: S_BRANCH %bb.2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: 
bb.2.atomicrmw.end: - ; GFX11-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1 - ; GFX11-NEXT: SI_END_CF [[PHI2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; GFX11-NEXT: FLAT_ATOMIC_ADD_F32 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) ; GFX11-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret void @@ -121,7 +100,6 @@ ; GFX940-NEXT: SI_RETURN_TO_EPILOG $vgpr0 ; GFX11-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw ; GFX11: bb.0 (%ir-block.0): - ; GFX11-NEXT: successors: %bb.1(0x80000000) ; GFX11-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2 @@ -129,29 +107,8 @@ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]] - ; GFX11-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr) - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.1.atomicrmw.start: - ; GFX11-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %4, %bb.1 - ; GFX11-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1 - ; GFX11-NEXT: %14:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[PHI1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed %14, %subreg.sub0, [[PHI1]], %subreg.sub1 - ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE1]] - ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic monotonic (s32) on %ir.2) - ; GFX11-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec - ; GFX11-NEXT: [[SI_IF_BREAK:%[0-9]+]]:sreg_32 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc - ; GFX11-NEXT: SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX11-NEXT: S_BRANCH %bb.2 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: bb.2.atomicrmw.end: - ; GFX11-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.1 - ; GFX11-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[SI_IF_BREAK]], %bb.1 - ; GFX11-NEXT: SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[PHI2]] + ; GFX11-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG $vgpr0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret float %ret Index: llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -28,14 +28,26 
@@ ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst ret void @@ -45,14 +57,26 @@ ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst ret void @@ -73,12 +97,25 @@ ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst ret float %ret Index: 
llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -3,7 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: @@ -243,29 +243,16 @@ ; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst @@ -545,29 +532,16 @@ ; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl0_inv ; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: 
s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off +; GFX11-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst @@ -743,6 +717,32 @@ ; GCN-NEXT: s_or_b64 exec, exec, s[2:3] ; GCN-NEXT: global_store_dword v[0:1], v1, off ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: s_mov_b64 s[2:3], 0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v1, s4 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_wbinvl1_vol +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX11-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX11-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: global_store_dword v[0:1], v1, off +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -759,6 +759,17 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_wbinvl1_vol ; GCN-NEXT: s_endpgm +; +; GFX11-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, 4.0 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_wbinvl1_vol +; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -122,21 +122,8 @@ ; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_unsafe( -; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 -; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* -; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to 
float -; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX11: atomicrmw.end: -; GFX11-NEXT: ret float [[TMP6]] +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4 +; GFX11-NEXT: ret float [[RES]] ; %res = atomicrmw fadd float addrspace(1)* %ptr, float %value syncscope("wavefront") monotonic ret float %res @@ -297,21 +284,8 @@ ; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( -; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4 -; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32* -; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX11: atomicrmw.end: -; GFX11-NEXT: ret float [[TMP6]] +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4 +; GFX11-NEXT: ret float [[RES]] ; %res = atomicrmw fadd float* %ptr, float %value syncscope("wavefront") monotonic ret float %res @@ -468,8 +442,21 @@ ; GFX90A-NEXT: ret float [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 -; GFX940-NEXT: ret float [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat( ; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4 @@ -562,8 +549,21 @@ ; GFX90A-NEXT: ret float [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 -; GFX940-NEXT: ret float [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: 
[[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global( ; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 @@ -656,7 +656,20 @@ ; GFX90A-NEXT: ret void ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: ; GFX940-NEXT: ret void ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( @@ -716,7 +729,20 @@ ; GFX9-NEXT: ret void ; ; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX908-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: ; GFX908-NEXT: ret void ; ; GFX90A-LABEL: 
@test_atomicrmw_fadd_f32_global_no_use_denorm_flush( @@ -737,11 +763,37 @@ ; GFX90A-NEXT: ret void ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: ; GFX940-NEXT: ret void ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( -; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: ; GFX11-NEXT: ret void ; %res = atomicrmw fadd float addrspace(1)* %ptr, float %value seq_cst @@ -947,8 +999,21 @@ ; GFX90A-NEXT: ret double [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd double* [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8 -; GFX940-NEXT: ret double [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast double* [[PTR]] to i64* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } 
[[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret double [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat( ; GFX11-NEXT: [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8 @@ -1041,8 +1106,21 @@ ; GFX90A-NEXT: ret double [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f64_global( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd double addrspace(1)* [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8 -; GFX940-NEXT: ret double [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret double [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f64_global( ; GFX11-NEXT: [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8 @@ -1216,8 +1294,21 @@ ; GFX90A-NEXT: ret float [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_agent( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("agent") monotonic, align 4 -; GFX940-NEXT: ret float [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") monotonic monotonic, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_agent( ; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 @@ -1310,8 +1401,21 @@ ; GFX90A-NEXT: ret float [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_one_as( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("one-as") monotonic, align 4 -; GFX940-NEXT: ret float [[RES]] +; GFX940-NEXT: 
[[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("one-as") monotonic monotonic, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_one_as( ; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4