Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1341,12 +1341,13 @@ } if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); + if (ST.hasGFX940Insts() || ST.getGeneration() >= AMDGPUSubtarget::GFX11) + Atomic.legalFor({{S32, FlatPtr}}); if (ST.hasGFX90AInsts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations // TODO: Move atomic expansion into legalizer - // TODO: Also supports <2 x f16> Atomic.legalFor({ {S32, GlobalPtr}, {S64, GlobalPtr}, Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4573,7 +4573,8 @@ case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: - case Intrinsic::amdgcn_ds_ordered_swap: { + case Intrinsic::amdgcn_ds_ordered_swap: + case Intrinsic::amdgcn_ds_fadd_v2bf16: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12783,11 +12783,7 @@ if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy())) return AtomicExpansionKind::CmpXChg; - if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) && - Subtarget->hasAtomicFaddNoRtnInsts()) { - if (Subtarget->hasGFX940Insts()) - return AtomicExpansionKind::None; - + if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) { // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe // floating point atomic instructions. May generate more efficient code, // but may not respect rounding and denormal modes, and may give incorrect @@ -12797,23 +12793,32 @@ .getValueAsString() != "true") return AtomicExpansionKind::CmpXChg; - if (Subtarget->hasGFX90AInsts()) { - if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) - return AtomicExpansionKind::CmpXChg; + // Always expand system scope fp atomics. + auto SSID = RMW->getSyncScopeID(); + if (SSID == SyncScope::System || + SSID == RMW->getContext().getOrInsertSyncScopeID("one-as")) + return AtomicExpansionKind::CmpXChg; - auto SSID = RMW->getSyncScopeID(); - if (SSID == SyncScope::System || - SSID == RMW->getContext().getOrInsertSyncScopeID("one-as")) - return AtomicExpansionKind::CmpXChg; + if (AS == AMDGPUAS::GLOBAL_ADDRESS && Ty->isFloatTy()) { + // global atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. + if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + // global atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. + if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); + } + // flat atomic fadd f32: gfx940, gfx11+. + if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy() && + (Subtarget->hasGFX940Insts() || + Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)) return ReportUnsafeHWInst(AtomicExpansionKind::None); - } - if (AS == AMDGPUAS::FLAT_ADDRESS) - return AtomicExpansionKind::CmpXChg; + // global and flat atomic fadd f64: gfx90a, gfx940. + if (Ty->isDoubleTy() && Subtarget->hasGFX90AInsts()) + return ReportUnsafeHWInst(AtomicExpansionKind::None); - return RMW->use_empty() ? ReportUnsafeHWInst(AtomicExpansionKind::None) - : AtomicExpansionKind::CmpXChg; + return AtomicExpansionKind::CmpXChg; } // DS FP atomics do respect the denormal mode, but the rounding mode is Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd.f32.ll @@ -1,26 +1,67 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI300 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI300 %s ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=GFX11 %s -; MI300: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s32) = G_ATOMICRMW_FADD ; GFX11: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.flat.atomic.fadd) define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float %data) { + ; MI300-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic + ; MI300: bb.1 (%ir-block.0): + ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI300-NEXT: {{ $}} + ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; MI300-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret void } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(float* %ptr, float %data) { + ; MI300-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic + ; MI300: bb.1 (%ir-block.0): + ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI300-NEXT: {{ $}} + ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret float %ret } define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(float* %ptr, float %data) #0 { + ; MI300-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw + ; MI300: bb.1 (%ir-block.0): + ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI300-NEXT: {{ $}} + ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; MI300-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret void } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(float* %ptr, float %data) #0 { + ; MI300-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; MI300: bb.1 (%ir-block.0): + ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI300-NEXT: {{ $}} + ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) @@ -24,6 +24,64 @@ ret void } +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn: ; GFX940: ; %bb.0: @@ -35,6 +93,34 @@ ret float %ret } +define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret float %ret +} + define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -60,6 +146,56 @@ ret <2 x half> %ret } +define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @global_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -83,3 +219,36 @@ %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret } + +define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_bf16 v1, v0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll @@ -4,7 +4,7 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.atomic.fadd) ; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn: ; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc Index: llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll +++ llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll @@ -28,14 +28,26 @@ ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB1_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst ret void @@ -45,14 +57,26 @@ ; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_mov_b64 s[2:3], 0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: flat_load_dword v1, v[0:1] +; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v1, v0 +; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX940-NEXT: s_cbranch_execnz .LBB2_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX940-NEXT: s_endpgm %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst ret void @@ -73,12 +97,25 @@ ; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: flat_load_dword v2, v[0:1] +; GFX940-NEXT: s_mov_b64 s[0:1], 0 +; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start +; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v3, v2 +; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 +; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX940-NEXT: s_cbranch_execnz .LBB4_1 +; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 ; GFX940-NEXT: s_setpc_b64 s[30:31] %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst ret float %ret Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -3,7 +3,9 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s +; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=GFX11 %s + +; GFX11: LLVM ERROR: Cannot select: {{.+}}: f32,ch = AtomicLoadFAdd<(load store syncscope("agent") seq_cst define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: @@ -113,35 +115,6 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_ret_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void @@ -239,35 +212,6 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -345,18 +289,6 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_noret_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -433,18 +365,6 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -541,35 +461,6 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -683,35 +574,6 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_ret_f32_system: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst store float %result, float addrspace(1)* undef ret void @@ -861,32 +723,6 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_noret_f32_safe: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -961,17 +797,6 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: infer_as_before_atomic: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %load = load float*, float* addrspace(4)* %arg %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 ret void Index: llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll =================================================================== --- llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -122,21 +122,8 @@ ; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_unsafe( -; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 -; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* -; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX11: atomicrmw.end: -; GFX11-NEXT: ret float [[TMP6]] +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4 +; GFX11-NEXT: ret float [[RES]] ; %res = atomicrmw fadd float addrspace(1)* %ptr, float %value syncscope("wavefront") monotonic ret float %res @@ -297,21 +284,8 @@ ; GFX940-NEXT: ret float [[RES]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat_unsafe( -; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4 -; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32* -; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 -; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("wavefront") monotonic monotonic, align 4 -; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 -; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 -; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float -; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX11: atomicrmw.end: -; GFX11-NEXT: ret float [[TMP6]] +; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("wavefront") monotonic, align 4 +; GFX11-NEXT: ret float [[RES]] ; %res = atomicrmw fadd float* %ptr, float %value syncscope("wavefront") monotonic ret float %res @@ -468,8 +442,21 @@ ; GFX90a-NEXT: ret float [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 -; GFX940-NEXT: ret float [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float* [[PTR]] to i32* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat( ; GFX11-NEXT: [[TMP1:%.*]] = load float, float* [[PTR:%.*]], align 4 @@ -562,8 +549,21 @@ ; GFX90a-NEXT: ret float [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 -; GFX940-NEXT: ret float [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global( ; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 @@ -656,7 +656,20 @@ ; GFX90a-NEXT: ret void ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: ; GFX940-NEXT: ret void ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee( @@ -716,7 +729,20 @@ ; GFX9-NEXT: ret void ; ; GFX908-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( -; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX908-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX908-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: ; GFX908-NEXT: ret void ; ; GFX90a-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( @@ -737,11 +763,37 @@ ; GFX90a-NEXT: ret void ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: ; GFX940-NEXT: ret void ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush( -; GFX11-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4 +; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX11-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX11-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX11-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: ; GFX11-NEXT: ret void ; %res = atomicrmw fadd float addrspace(1)* %ptr, float %value seq_cst @@ -947,8 +999,21 @@ ; GFX90a-NEXT: ret double [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd double* [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8 -; GFX940-NEXT: ret double [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast double* [[PTR]] to i64* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i64* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret double [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat( ; GFX11-NEXT: [[TMP1:%.*]] = load double, double* [[PTR:%.*]], align 8 @@ -1041,8 +1106,21 @@ ; GFX90a-NEXT: ret double [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f64_global( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd double addrspace(1)* [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8 -; GFX940-NEXT: ret double [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast double addrspace(1)* [[PTR]] to i64 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast double [[NEW]] to i64 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i64 addrspace(1)* [[TMP2]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst, align 8 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret double [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f64_global( ; GFX11-NEXT: [[TMP1:%.*]] = load double, double addrspace(1)* [[PTR:%.*]], align 8 @@ -1216,8 +1294,21 @@ ; GFX90a-NEXT: ret float [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_agent( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("agent") monotonic, align 4 -; GFX940-NEXT: ret float [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") monotonic monotonic, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_agent( ; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 @@ -1310,8 +1401,21 @@ ; GFX90a-NEXT: ret float [[TMP6]] ; ; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_one_as( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd float addrspace(1)* [[PTR:%.*]], float [[VALUE:%.*]] syncscope("one-as") monotonic, align 4 -; GFX940-NEXT: ret float [[RES]] +; GFX940-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4 +; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX940: atomicrmw.start: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ] +; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]] +; GFX940-NEXT: [[TMP2:%.*]] = bitcast float addrspace(1)* [[PTR]] to i32 addrspace(1)* +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[NEW]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg i32 addrspace(1)* [[TMP2]], i32 [[TMP4]], i32 [[TMP3]] syncscope("one-as") monotonic monotonic, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[TMP6]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX940: atomicrmw.end: +; GFX940-NEXT: ret float [[TMP6]] ; ; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_one_as( ; GFX11-NEXT: [[TMP1:%.*]] = load float, float addrspace(1)* [[PTR:%.*]], align 4