Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1341,12 +1341,15 @@ } if (ST.hasAtomicFaddInsts()) Atomic.legalFor({{S32, GlobalPtr}}); + if (ST.hasGFX940Insts()) + Atomic.legalFor({{S32, FlatPtr}}); + if (AMDGPU::isGFX11Plus(ST)) + Atomic.legalFor({{S32, FlatPtr}}); if (ST.hasGFX90AInsts()) { // These are legal with some caveats, and should have undergone expansion in // the IR in most situations // TODO: Move atomic expansion into legalizer - // TODO: Also supports <2 x f16> Atomic.legalFor({ {S32, GlobalPtr}, {S64, GlobalPtr}, Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4572,7 +4572,8 @@ case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: return getDefaultMappingAllVGPR(MI); case Intrinsic::amdgcn_ds_ordered_add: - case Intrinsic::amdgcn_ds_ordered_swap: { + case Intrinsic::amdgcn_ds_ordered_swap: + case Intrinsic::amdgcn_ds_fadd_v2bf16: { unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12777,6 +12777,13 @@ Subtarget->hasAtomicFaddNoRtnInsts()) { if (Subtarget->hasGFX940Insts()) return AtomicExpansionKind::None; + // Global fadd f32 no-rtn for gfx908 (and gfx11+). + if (!Subtarget->hasGFX90AInsts() && AS == AMDGPUAS::GLOBAL_ADDRESS && + Ty->isFloatTy() && RMW->use_empty()) + return AtomicExpansionKind::None; + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 && + Ty->isFloatTy()) + return AtomicExpansionKind::None; // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe // floating point atomic instructions. May generate more efficient code, Index: llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd-f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd-f32.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/flat-atomic-fadd-f32.ll @@ -1,26 +1,67 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=MI300 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=MI300 %s ; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s 2>&1 | FileCheck -check-prefix=GFX11 %s -; MI300: LLVM ERROR: unable to legalize instruction: %4:_(s32) = G_ATOMICRMW_FADD %0:_(p0), %1:_ :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) (in function: flat_atomic_fadd_f32_no_rtn_atomicrmw) ; GFX11: LLVM ERROR: cannot select: %4:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.flat.atomic.fadd), %0:vgpr(p0), %1:vgpr(s32) :: (volatile dereferenceable load store (s32) on %ir.ptr) (in function: flat_atomic_fadd_f32_no_rtn_intrinsic) define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_intrinsic(float* %ptr, float %data) { + ; MI300-LABEL: name: flat_atomic_fadd_f32_no_rtn_intrinsic + ; MI300: bb.1 (%ir-block.0): + ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI300-NEXT: {{ $}} + ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; MI300-NEXT: S_ENDPGM 0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret void } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_intrinsic(float* %ptr, float %data) { + ; MI300-LABEL: name: flat_atomic_fadd_f32_rtn_intrinsic + ; MI300: bb.1 (%ir-block.0): + ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI300-NEXT: {{ $}} + ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr) + ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p1f32.f32(float* %ptr, float %data) ret float %ret } define amdgpu_ps void @flat_atomic_fadd_f32_no_rtn_atomicrmw(float* %ptr, float %data) #0 { + ; MI300-LABEL: name: flat_atomic_fadd_f32_no_rtn_atomicrmw + ; MI300: bb.1 (%ir-block.0): + ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI300-NEXT: {{ $}} + ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI300-NEXT: FLAT_ATOMIC_ADD_F32 [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; MI300-NEXT: S_ENDPGM 0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret void } define amdgpu_ps float @flat_atomic_fadd_f32_rtn_atomicrmw(float* %ptr, float %data) #0 { + ; MI300-LABEL: name: flat_atomic_fadd_f32_rtn_atomicrmw + ; MI300: bb.1 (%ir-block.0): + ; MI300-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; MI300-NEXT: {{ $}} + ; MI300-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; MI300-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; MI300-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 + ; MI300-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; MI300-NEXT: [[FLAT_ATOMIC_ADD_F32_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_F32_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s32) on %ir.ptr) + ; MI300-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_ADD_F32_RTN]] + ; MI300-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 %ret = atomicrmw fadd float* %ptr, float %data syncscope("wavefront") monotonic ret float %ret } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 +; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -global-isel -verify-machineinstrs | FileCheck %s -check-prefix=GFX940 declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0f32.f32(float* %ptr, float %data) declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0v2f16.v2f16(<2 x half>* %ptr, <2 x half> %data) @@ -24,6 +24,40 @@ ret void } +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(float* %ptr) { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(float* %ptr) #0 { +; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret void +} + define float @flat_atomic_fadd_f32_rtn(float* %ptr, float %data) { ; GFX940-LABEL: flat_atomic_fadd_f32_rtn: ; GFX940: ; %bb.0: @@ -35,6 +69,21 @@ ret float %ret } +define float @flat_atomic_fadd_f32_rtn_pat(float* %ptr, float %data) { +; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = atomicrmw fadd float* %ptr, float 4.0 seq_cst + ret float %ret +} + define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(<2 x half>* %ptr, <2 x half> %data) { ; GFX940-LABEL: flat_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -60,6 +109,56 @@ ret <2 x half> %ret } +define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX940-NEXT: v_mov_b32_e32 v2, s4 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(<2 x i16>* %ptr, <2 x i16> %data) { +; GFX940-LABEL: flat_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0v2i16(<2 x i16>* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v1, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s4 +; GFX940-NEXT: global_atomic_pk_add_bf16 v1, v0, s[2:3] +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @global_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: global_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1v2i16(<2 x i16> addrspace(1)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(<2 x half> addrspace(3)* %ptr, <2 x half> %data) { ; GFX940-LABEL: local_atomic_fadd_v2f16_noret: ; GFX940: ; %bb.0: @@ -83,3 +182,36 @@ %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(<2 x half> addrspace(3)* %ptr, <2 x half> %data, i32 0, i32 0, i1 0) ret <2 x half> %ret } + +define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_noret: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v0, s1 +; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_bf16 v1, v0 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_endpgm + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret void +} + +define <2 x i16> @local_atomic_fadd_v2bf16_rtn(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) { +; GFX940-LABEL: local_atomic_fadd_v2bf16_rtn: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_wbl2 sc0 sc1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1 +; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX940-NEXT: buffer_inv sc0 sc1 +; GFX940-NEXT: s_setpc_b64 s[30:31] + %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(<2 x i16> addrspace(3)* %ptr, <2 x i16> %data) + ret <2 x i16> %ret +} + +attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd-with-ret.ll @@ -4,7 +4,7 @@ declare float @llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32(float addrspace(1)* nocapture, float) declare <2 x half> @llvm.amdgcn.global.atomic.fadd.f32.p1v2f16.v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) -; GFX908: error: {{.*}} return versions of fp atomics not supported +; GFX908: LLVM ERROR: cannot select: %4:vgpr_32(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.global.atomic.fadd), %0:vgpr(p1), %1:vgpr(s32) :: (volatile dereferenceable load store (s32) on %ir.ptr, addrspace 1) (in function: global_atomic_fadd_f32_rtn) ; GFX90A-LABEL: {{^}}global_atomic_fadd_f32_rtn: ; GFX90A: global_atomic_add_f32 v0, v[0:1], v2, off glc Index: llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -3,7 +3,9 @@ ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s +; RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s 2>&1 | FileCheck -enable-var-scope -check-prefixes=GFX11 %s + +; GFX11: LLVM ERROR: Cannot select: t15: f32,ch = AtomicLoadFAdd<(load store seq_cst (s32) on %ir.ptr.load, addrspace 1)> t0, t21, ConstantFP:f32<4.000000e+00> define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { ; GFX900-LABEL: global_atomic_fadd_ret_f32: @@ -113,35 +115,6 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_ret_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB0_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void @@ -239,35 +212,6 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB1_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -345,18 +289,6 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB2_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_noret_f32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -433,18 +365,6 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB3_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -541,35 +461,6 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_ret_f32_agent: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB4_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst store float %result, float addrspace(1)* undef ret void @@ -683,35 +574,6 @@ ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_ret_f32_system: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v2 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB5_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: global_store_b32 v[0:1], v1, off -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst store float %result, float addrspace(1)* undef ret void @@ -791,25 +653,12 @@ ; GFX908-LABEL: global_atomic_fadd_noret_f32_safe: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz .LBB8_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX908-NEXT: s_endpgm ; ; GFX90A-LABEL: global_atomic_fadd_noret_f32_safe: @@ -861,32 +710,6 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB8_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_fadd_noret_f32_safe: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v1, s2 -; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start -; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, v0 -; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 -; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 -; GFX11-NEXT: s_cbranch_execnz .LBB8_1 -; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX11-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst ret void } @@ -961,17 +784,6 @@ ; GFX10-NEXT: s_cbranch_execnz .LBB9_1 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: infer_as_before_atomic: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm %load = load float*, float* addrspace(4)* %arg %v = atomicrmw fadd float* %load, float 1.0 syncscope("agent-one-as") monotonic, align 4 ret void