diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -144,7 +144,8 @@ bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const; bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const; bool selectAMDGPU_BUFFER_ATOMIC_FADD(MachineInstr &I) const; - bool selectGlobalAtomicFaddIntrinsic(MachineInstr &I) const; + bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp, + MachineOperand &DataOp) const; bool selectBVHIntrinsic(MachineInstr &I) const; std::pair selectVOP3ModsImpl(MachineOperand &Root, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1749,7 +1749,7 @@ case Intrinsic::amdgcn_s_barrier: return selectSBarrier(I); case Intrinsic::amdgcn_global_atomic_fadd: - return selectGlobalAtomicFaddIntrinsic(I); + return selectGlobalAtomicFadd(I, I.getOperand(2), I.getOperand(3)); default: { return selectImpl(I, *CoverageInfo); } @@ -2352,6 +2352,13 @@ bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW( MachineInstr &I) const { + if (I.getOpcode() == TargetOpcode::G_ATOMICRMW_FADD) { + const LLT PtrTy = MRI->getType(I.getOperand(1).getReg()); + unsigned AS = PtrTy.getAddressSpace(); + if (AS == AMDGPUAS::GLOBAL_ADDRESS) + return selectGlobalAtomicFadd(I, I.getOperand(1), I.getOperand(2)); + } + initM0(I); return selectImpl(I, *CoverageInfo); } @@ -2994,11 +3001,14 @@ return true; } -bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic( - MachineInstr &MI) const{ +bool AMDGPUInstructionSelector::selectGlobalAtomicFadd( + MachineInstr &MI, MachineOperand &AddrOp, MachineOperand &DataOp) const { - if (STI.hasGFX90AInsts()) + if (STI.hasGFX90AInsts()) { + // gfx90a adds return versions of the global atomic fadd instructions so no + // special handling is required. return selectImpl(MI, *CoverageInfo); + } MachineBasicBlock *MBB = MI.getParent(); const DebugLoc &DL = MI.getDebugLoc(); @@ -3015,9 +3025,9 @@ // FIXME: This is only needed because tablegen requires number of dst operands // in match and replace pattern to be the same. Otherwise patterns can be // exported from SDag path. - auto Addr = selectFlatOffsetImpl(MI.getOperand(2)); + auto Addr = selectFlatOffsetImpl(AddrOp); - Register Data = MI.getOperand(3).getReg(); + Register Data = DataOp.getReg(); const unsigned Opc = MRI->getType(Data).isVector() ? AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32; auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1289,12 +1289,14 @@ Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } + auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); if (ST.hasLDSFPAtomics()) { - auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD) - .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); + Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); if (ST.hasGFX90AInsts()) Atomic.legalFor({{S64, LocalPtr}}); } + if (ST.hasAtomicFaddInsts()) + Atomic.legalFor({{S32, GlobalPtr}}); // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-global.mir @@ -0,0 +1,22 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 -O0 -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a -O0 -run-pass=legalizer %s -o - | FileCheck %s + +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel-abort=2 -pass-remarks-missed='gisel.*' -o /dev/null %s 2>&1 | FileCheck -check-prefix=ERR %s + +# ERR: remark: :0:0: unable to legalize instruction: %2:_(s32) = G_ATOMICRMW_FADD %0:_(p1), %1:_ :: (load store seq_cst 4, addrspace 1) (in function: atomicrmw_fadd_global_i32) + +--- +name: atomicrmw_fadd_global_i32 + +body: | + bb.0: + liveins: $sgpr0_sgpr1, $sgpr2 + ; CHECK-LABEL: name: atomicrmw_fadd_global_i32 + ; CHECK: [[COPY:%[0-9]+]]:_(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr2 + ; CHECK: [[ATOMICRMW_FADD:%[0-9]+]]:_(s32) = G_ATOMICRMW_FADD [[COPY]](p1), [[COPY1]] :: (load store seq_cst 4, addrspace 1) + %0:_(p1) = COPY $sgpr0_sgpr1 + %1:_(s32) = COPY $sgpr2 + %2:_(s32) = G_ATOMICRMW_FADD %0, %1 :: (load store seq_cst 4, addrspace 1) +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir rename from llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd.mir rename to llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomicrmw-fadd-local.mir diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -1,340 +1,656 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s -; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s -; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SDAG,SDAG-GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SDAG,SDAG-GFX908 %s +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SDAG,SDAG-GFX90A %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GISEL,GISEL-GFX900 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GISEL,GISEL-GFX908 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GISEL,GISEL-GFX90A %s define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr) #0 { -; GFX900-LABEL: global_atomic_fadd_ret_f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v0, s4 -; GFX900-NEXT: BB0_1: ; %atomicrmw.start -; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz BB0_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_endpgm +; SDAG-GFX900-LABEL: global_atomic_fadd_ret_f32: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX900-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-GFX900-NEXT: BB0_1: ; %atomicrmw.start +; SDAG-GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX900-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX900-NEXT: buffer_wbinvl1_vol +; SDAG-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX900-NEXT: s_cbranch_execnz BB0_1 +; SDAG-GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; SDAG-GFX900-NEXT: global_store_dword v[0:1], v0, off +; SDAG-GFX900-NEXT: s_endpgm ; -; GFX908-LABEL: global_atomic_fadd_ret_f32: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s4 -; GFX908-NEXT: BB0_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz BB0_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v0, off -; GFX908-NEXT: s_endpgm +; SDAG-GFX908-LABEL: global_atomic_fadd_ret_f32: +; SDAG-GFX908: ; %bb.0: +; SDAG-GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX908-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX908-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-GFX908-NEXT: BB0_1: ; %atomicrmw.start +; SDAG-GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX908-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX908-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; SDAG-GFX908-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX908-NEXT: buffer_wbinvl1_vol +; SDAG-GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX908-NEXT: s_cbranch_execnz BB0_1 +; SDAG-GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; SDAG-GFX908-NEXT: global_store_dword v[0:1], v0, off +; SDAG-GFX908-NEXT: s_endpgm ; -; GFX90A-LABEL: global_atomic_fadd_ret_f32: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: global_store_dword v[0:1], v0, off -; GFX90A-NEXT: s_endpgm +; SDAG-GFX90A-LABEL: global_atomic_fadd_ret_f32: +; SDAG-GFX90A: ; %bb.0: +; SDAG-GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; SDAG-GFX90A-NEXT: buffer_wbl2 +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc scc +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX90A-NEXT: buffer_invl2 +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX90A-NEXT: buffer_wbinvl1_vol +; SDAG-GFX90A-NEXT: global_store_dword v[0:1], v0, off +; SDAG-GFX90A-NEXT: s_endpgm +; +; GISEL-GFX900-LABEL: global_atomic_fadd_ret_f32: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-GFX900-NEXT: BB0_1: ; %atomicrmw.start +; GISEL-GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX900-NEXT: buffer_wbinvl1_vol +; GISEL-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX900-NEXT: s_cbranch_execnz BB0_1 +; GISEL-GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL-GFX900-NEXT: global_store_dword v[0:1], v0, off +; GISEL-GFX900-NEXT: s_endpgm +; +; GISEL-GFX908-LABEL: global_atomic_fadd_ret_f32: +; GISEL-GFX908: ; %bb.0: +; GISEL-GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-GFX908-NEXT: BB0_1: ; %atomicrmw.start +; GISEL-GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GISEL-GFX908-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX908-NEXT: buffer_wbinvl1_vol +; GISEL-GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX908-NEXT: s_cbranch_execnz BB0_1 +; GISEL-GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL-GFX908-NEXT: global_store_dword v[0:1], v0, off +; GISEL-GFX908-NEXT: s_endpgm +; +; GISEL-GFX90A-LABEL: global_atomic_fadd_ret_f32: +; GISEL-GFX90A: ; %bb.0: +; GISEL-GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v0, 4.0 +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX90A-NEXT: buffer_wbl2 +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX90A-NEXT: global_atomic_add_f32 v0, v1, v0, s[0:1] glc scc +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX90A-NEXT: buffer_invl2 +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX90A-NEXT: buffer_wbinvl1_vol +; GISEL-GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GISEL-GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void } define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %ptr) #2 { -; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v0, s4 -; GFX900-NEXT: BB1_1: ; %atomicrmw.start -; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz BB1_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_endpgm +; SDAG-GFX900-LABEL: global_atomic_fadd_ret_f32_ieee: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX900-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-GFX900-NEXT: BB1_1: ; %atomicrmw.start +; SDAG-GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX900-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX900-NEXT: buffer_wbinvl1_vol +; SDAG-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX900-NEXT: s_cbranch_execnz BB1_1 +; SDAG-GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; SDAG-GFX900-NEXT: global_store_dword v[0:1], v0, off +; SDAG-GFX900-NEXT: s_endpgm +; +; SDAG-GFX908-LABEL: global_atomic_fadd_ret_f32_ieee: +; SDAG-GFX908: ; %bb.0: +; SDAG-GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX908-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX908-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-GFX908-NEXT: BB1_1: ; %atomicrmw.start +; SDAG-GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX908-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX908-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; SDAG-GFX908-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX908-NEXT: buffer_wbinvl1_vol +; SDAG-GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX908-NEXT: s_cbranch_execnz BB1_1 +; SDAG-GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; SDAG-GFX908-NEXT: global_store_dword v[0:1], v0, off +; SDAG-GFX908-NEXT: s_endpgm ; -; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v0, s4 -; GFX908-NEXT: BB1_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz BB1_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX908-NEXT: global_store_dword v[0:1], v0, off -; GFX908-NEXT: s_endpgm +; SDAG-GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: +; SDAG-GFX90A: ; %bb.0: +; SDAG-GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-GFX90A-NEXT: BB1_1: ; %atomicrmw.start +; SDAG-GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX90A-NEXT: buffer_wbl2 +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX90A-NEXT: buffer_invl2 +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX90A-NEXT: buffer_wbinvl1_vol +; SDAG-GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX90A-NEXT: s_cbranch_execnz BB1_1 +; SDAG-GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; SDAG-GFX90A-NEXT: global_store_dword v[0:1], v0, off +; SDAG-GFX90A-NEXT: s_endpgm ; -; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v0, s4 -; GFX90A-NEXT: BB1_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz BB1_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX90A-NEXT: global_store_dword v[0:1], v0, off -; GFX90A-NEXT: s_endpgm +; GISEL-GFX900-LABEL: global_atomic_fadd_ret_f32_ieee: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-GFX900-NEXT: BB1_1: ; %atomicrmw.start +; GISEL-GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX900-NEXT: buffer_wbinvl1_vol +; GISEL-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX900-NEXT: s_cbranch_execnz BB1_1 +; GISEL-GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX900-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL-GFX900-NEXT: global_store_dword v[0:1], v0, off +; GISEL-GFX900-NEXT: s_endpgm +; +; GISEL-GFX908-LABEL: global_atomic_fadd_ret_f32_ieee: +; GISEL-GFX908: ; %bb.0: +; GISEL-GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX908-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-GFX908-NEXT: BB1_1: ; %atomicrmw.start +; GISEL-GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GISEL-GFX908-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX908-NEXT: buffer_wbinvl1_vol +; GISEL-GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX908-NEXT: s_cbranch_execnz BB1_1 +; GISEL-GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX908-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL-GFX908-NEXT: global_store_dword v[0:1], v0, off +; GISEL-GFX908-NEXT: s_endpgm +; +; GISEL-GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee: +; GISEL-GFX90A: ; %bb.0: +; GISEL-GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-GFX90A-NEXT: BB1_1: ; %atomicrmw.start +; GISEL-GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX90A-NEXT: buffer_wbl2 +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX90A-NEXT: buffer_invl2 +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX90A-NEXT: buffer_wbinvl1_vol +; GISEL-GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX90A-NEXT: s_cbranch_execnz BB1_1 +; GISEL-GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX90A-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL-GFX90A-NEXT: global_store_dword v[0:1], v0, off +; GISEL-GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void } define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr) #0 { -; GFX900-LABEL: global_atomic_fadd_noret_f32: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: BB2_1: ; %atomicrmw.start -; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz BB2_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_endpgm +; SDAG-GFX900-LABEL: global_atomic_fadd_noret_f32: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX900-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, s4 +; SDAG-GFX900-NEXT: BB2_1: ; %atomicrmw.start +; SDAG-GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX900-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX900-NEXT: buffer_wbinvl1_vol +; SDAG-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX900-NEXT: s_cbranch_execnz BB2_1 +; SDAG-GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX900-NEXT: s_endpgm +; +; SDAG-GFX908-LABEL: global_atomic_fadd_noret_f32: +; SDAG-GFX908: ; %bb.0: +; SDAG-GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX908-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-GFX908-NEXT: v_mov_b32_e32 v1, 4.0 +; SDAG-GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; SDAG-GFX908-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX908-NEXT: buffer_wbinvl1_vol +; SDAG-GFX908-NEXT: s_endpgm ; -; GFX908-LABEL: global_atomic_fadd_noret_f32: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 -; GFX908-NEXT: v_mov_b32_e32 v1, 4.0 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: s_endpgm +; SDAG-GFX90A-LABEL: global_atomic_fadd_noret_f32: +; SDAG-GFX90A: ; %bb.0: +; SDAG-GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 +; SDAG-GFX90A-NEXT: buffer_wbl2 +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] scc +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX90A-NEXT: buffer_invl2 +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX90A-NEXT: buffer_wbinvl1_vol +; SDAG-GFX90A-NEXT: s_endpgm ; -; GFX90A-LABEL: global_atomic_fadd_noret_f32: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 -; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: s_endpgm +; GISEL-GFX900-LABEL: global_atomic_fadd_noret_f32: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, s4 +; GISEL-GFX900-NEXT: BB2_1: ; %atomicrmw.start +; GISEL-GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX900-NEXT: buffer_wbinvl1_vol +; GISEL-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX900-NEXT: s_cbranch_execnz BB2_1 +; GISEL-GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX900-NEXT: s_endpgm +; +; GISEL-GFX908-LABEL: global_atomic_fadd_noret_f32: +; GISEL-GFX908: ; %bb.0: +; GISEL-GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX908-NEXT: v_mov_b32_e32 v2, 4.0 +; GISEL-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX908-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-GFX908-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GISEL-GFX908-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX908-NEXT: buffer_wbinvl1_vol +; GISEL-GFX908-NEXT: s_endpgm +; +; GISEL-GFX90A-LABEL: global_atomic_fadd_noret_f32: +; GISEL-GFX90A: ; %bb.0: +; GISEL-GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v0, 4.0 +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-GFX90A-NEXT: buffer_wbl2 +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX90A-NEXT: global_atomic_add_f32 v0, v1, v0, s[0:1] glc scc +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX90A-NEXT: buffer_invl2 +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX90A-NEXT: buffer_wbinvl1_vol +; GISEL-GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst ret void } define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)* %ptr) #2 { -; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee: -; GFX900: ; %bb.0: -; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX900-NEXT: s_mov_b64 s[2:3], 0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX900-NEXT: s_waitcnt lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, s4 -; GFX900-NEXT: BB3_1: ; %atomicrmw.start -; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX900-NEXT: v_mov_b32_e32 v2, 0 -; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: buffer_wbinvl1_vol -; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX900-NEXT: v_mov_b32_e32 v1, v0 -; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX900-NEXT: s_cbranch_execnz BB3_1 -; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX900-NEXT: s_endpgm +; SDAG-GFX900-LABEL: global_atomic_fadd_noret_f32_ieee: +; SDAG-GFX900: ; %bb.0: +; SDAG-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX900-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, s4 +; SDAG-GFX900-NEXT: BB3_1: ; %atomicrmw.start +; SDAG-GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX900-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX900-NEXT: buffer_wbinvl1_vol +; SDAG-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX900-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX900-NEXT: s_cbranch_execnz BB3_1 +; SDAG-GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX900-NEXT: s_endpgm +; +; SDAG-GFX908-LABEL: global_atomic_fadd_noret_f32_ieee: +; SDAG-GFX908: ; %bb.0: +; SDAG-GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX908-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX908-NEXT: v_mov_b32_e32 v1, s4 +; SDAG-GFX908-NEXT: BB3_1: ; %atomicrmw.start +; SDAG-GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX908-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; SDAG-GFX908-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX908-NEXT: buffer_wbinvl1_vol +; SDAG-GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX908-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX908-NEXT: s_cbranch_execnz BB3_1 +; SDAG-GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX908-NEXT: s_endpgm +; +; SDAG-GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee: +; SDAG-GFX90A: ; %bb.0: +; SDAG-GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; SDAG-GFX90A-NEXT: BB3_1: ; %atomicrmw.start +; SDAG-GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-GFX90A-NEXT: buffer_wbl2 +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX90A-NEXT: buffer_invl2 +; SDAG-GFX90A-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX90A-NEXT: buffer_wbinvl1_vol +; SDAG-GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-GFX90A-NEXT: s_cbranch_execnz BB3_1 +; SDAG-GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-GFX90A-NEXT: s_endpgm ; -; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX908-NEXT: s_mov_b64 s[2:3], 0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_mov_b32_e32 v1, s4 -; GFX908-NEXT: BB3_1: ; %atomicrmw.start -; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX908-NEXT: v_mov_b32_e32 v2, 0 -; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_wbinvl1_vol -; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX908-NEXT: v_mov_b32_e32 v1, v0 -; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX908-NEXT: s_cbranch_execnz BB3_1 -; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX908-NEXT: s_endpgm +; GISEL-GFX900-LABEL: global_atomic_fadd_noret_f32_ieee: +; GISEL-GFX900: ; %bb.0: +; GISEL-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX900-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX900-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, s4 +; GISEL-GFX900-NEXT: BB3_1: ; %atomicrmw.start +; GISEL-GFX900-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX900-NEXT: buffer_wbinvl1_vol +; GISEL-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX900-NEXT: s_cbranch_execnz BB3_1 +; GISEL-GFX900-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX900-NEXT: s_endpgm ; -; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: s_mov_b64 s[2:3], 0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v1, s4 -; GFX90A-NEXT: BB3_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_invl2 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX90A-NEXT: v_mov_b32_e32 v1, v0 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX90A-NEXT: s_cbranch_execnz BB3_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_endpgm +; GISEL-GFX908-LABEL: global_atomic_fadd_noret_f32_ieee: +; GISEL-GFX908: ; %bb.0: +; GISEL-GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX908-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX908-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX908-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX908-NEXT: v_mov_b32_e32 v1, s4 +; GISEL-GFX908-NEXT: BB3_1: ; %atomicrmw.start +; GISEL-GFX908-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX908-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GISEL-GFX908-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX908-NEXT: buffer_wbinvl1_vol +; GISEL-GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX908-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX908-NEXT: s_cbranch_execnz BB3_1 +; GISEL-GFX908-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX908-NEXT: s_endpgm +; +; GISEL-GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee: +; GISEL-GFX90A: ; %bb.0: +; GISEL-GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-GFX90A-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-GFX90A-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v1, s4 +; GISEL-GFX90A-NEXT: BB3_1: ; %atomicrmw.start +; GISEL-GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-GFX90A-NEXT: buffer_wbl2 +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX90A-NEXT: buffer_invl2 +; GISEL-GFX90A-NEXT: s_waitcnt vmcnt(0) +; GISEL-GFX90A-NEXT: buffer_wbinvl1_vol +; GISEL-GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-GFX90A-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-GFX90A-NEXT: s_cbranch_execnz BB3_1 +; GISEL-GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-GFX90A-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst ret void } define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 { -; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: s_mov_b64 s[2:3], 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: BB4_1: ; %atomicrmw.start -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_add_f32_e32 v0, 4.0, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_wbinvl1_vol -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GCN-NEXT: s_cbranch_execnz BB4_1 -; GCN-NEXT: ; %bb.2: ; %atomicrmw.end -; GCN-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; SDAG-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-NEXT: s_mov_b64 s[2:3], 0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: s_load_dword s4, s[0:1], 0x0 +; SDAG-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-NEXT: v_mov_b32_e32 v0, s4 +; SDAG-NEXT: BB4_1: ; %atomicrmw.start +; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; SDAG-NEXT: v_mov_b32_e32 v1, v0 +; SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-NEXT: v_add_f32_e32 v0, 4.0, v1 +; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_wbinvl1_vol +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SDAG-NEXT: s_cbranch_execnz BB4_1 +; SDAG-NEXT: ; %bb.2: ; %atomicrmw.end +; SDAG-NEXT: s_or_b64 exec, exec, s[2:3] +; SDAG-NEXT: global_store_dword v[0:1], v0, off +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-NEXT: s_mov_b64 s[2:3], 0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: s_load_dword s4, s[0:1], 0x0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: BB4_1: ; %atomicrmw.start +; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GISEL-NEXT: v_mov_b32_e32 v1, v0 +; GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GISEL-NEXT: v_add_f32_e32 v0, 4.0, v1 +; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_wbinvl1_vol +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GISEL-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GISEL-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GISEL-NEXT: s_cbranch_execnz BB4_1 +; GISEL-NEXT: ; %bb.2: ; %atomicrmw.end +; GISEL-NEXT: s_or_b64 exec, exec, s[2:3] +; GISEL-NEXT: global_store_dword v[0:1], v0, off +; GISEL-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst store float %result, float addrspace(1)* undef ret void } define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float addrspace(1)* %ptr) #1 { -; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: -; GCN: ; %bb.0: -; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 4.0 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_wbinvl1_vol -; GCN-NEXT: s_endpgm +; SDAG-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; SDAG-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-NEXT: v_mov_b32_e32 v1, 4.0 +; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SDAG-NEXT: global_atomic_add_f32 v0, v1, s[0:1] +; SDAG-NEXT: s_waitcnt vmcnt(0) +; SDAG-NEXT: buffer_wbinvl1_vol +; SDAG-NEXT: s_endpgm +; +; GISEL-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GISEL-NEXT: v_mov_b32_e32 v2, 4.0 +; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GISEL-NEXT: global_atomic_add_f32 v[0:1], v2, off +; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: buffer_wbinvl1_vol +; GISEL-NEXT: s_endpgm %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst ret void }