Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -119,6 +119,12 @@
 def : GINodeEquiv<G_ATOMIC_CMPXCHG, atomic_cmp_swap_glue>;
 def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
+// FIXME: Check MMO is atomic
+def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>;
+
 class GISelSop2Pat <
   SDPatternOperator node,
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1956,6 +1956,10 @@
     return selectG_PTR_MASK(I);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     return selectG_EXTRACT_VECTOR_ELT(I);
+  case AMDGPU::G_AMDGPU_ATOMIC_INC:
+  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
+    initM0(I);
+    return selectImpl(I, *CoverageInfo);
   default:
     return selectImpl(I, *CoverageInfo);
   }
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -106,6 +106,10 @@
                             Register Reg) const;
   bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
                               MachineIRBuilder &B, bool IsFormat) const;
+
+  bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
+                            bool IsInc) const;
+
   bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
                          MachineIRBuilder &B) const override;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2352,6 +2352,22 @@
   return Ty == S32;
 }
 
+bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
+                                               MachineIRBuilder &B,
+                                               bool IsInc) const {
+  B.setInstr(MI);
+  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
+                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
+  B.buildInstr(Opc)
+    .addDef(MI.getOperand(0).getReg())
+    .addUse(MI.getOperand(2).getReg())
+    .addUse(MI.getOperand(3).getReg())
+    .cloneMemRefs(MI);
+  MI.eraseFromParent();
+  return true;
+}
+
+// FIXME: Needs observer, like custom legalization does.
 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
@@ -2470,6 +2486,10 @@
     return legalizeRawBufferStore(MI, MRI, B, false);
   case Intrinsic::amdgcn_raw_buffer_store_format:
     return legalizeRawBufferStore(MI, MRI, B, true);
+  case Intrinsic::amdgcn_atomic_inc:
+    return legalizeAtomicIncDec(MI, B, true);
+  case Intrinsic::amdgcn_atomic_dec:
+    return legalizeAtomicIncDec(MI, B, false);
   default:
     return true;
   }
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3145,8 +3145,6 @@
     case Intrinsic::amdgcn_ds_fadd:
     case Intrinsic::amdgcn_ds_fmin:
     case Intrinsic::amdgcn_ds_fmax:
-    case Intrinsic::amdgcn_atomic_inc:
-    case Intrinsic::amdgcn_atomic_dec:
       return getDefaultMappingAllVGPR(MI);
     case Intrinsic::amdgcn_ds_ordered_add:
     case Intrinsic::amdgcn_ds_ordered_swap: {
@@ -3342,7 +3340,9 @@
   case AMDGPU::G_ATOMICRMW_UMIN:
   case AMDGPU::G_ATOMICRMW_FADD:
   case AMDGPU::G_ATOMIC_CMPXCHG:
-  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
+  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
+  case AMDGPU::G_AMDGPU_ATOMIC_INC:
+  case AMDGPU::G_AMDGPU_ATOMIC_DEC: {
     return getDefaultMappingAllVGPR(MI);
   }
   case AMDGPU::G_BRCOND: {
Index: llvm/lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstructions.td
+++ llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2150,8 +2150,13 @@
 // operands.
 def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
   let OutOperandList = (outs type0:$oldval);
-  let InOperandList = (ins ptype1:$addr, type0:$cmpval_nnenwval);
+  let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
   let hasSideEffects = 0;
   let mayLoad = 1;
   let mayStore = 1;
 }
+
+let Namespace = "AMDGPU" in {
+def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
+}
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -0,0 +1,1859 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+
+; FIXME: Merge with the other test. DS offset folding doesn't work due to
+; register bank copies, and the no-return optimization is missing.
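+
+; Not from the original patch: a rough sketch of the lowering this change
+; sets up. The legalizer rewrites each llvm.amdgcn.atomic.dec call into the
+; new target generic opcode, approximately:
+;   %result:_(s32) = G_AMDGPU_ATOMIC_DEC %ptr(p3), %data(s32)
+; with the call's memory operands cloned onto the new instruction. The
+; GINodeEquiv entries added in AMDGPUGISel.td then let the imported
+; SelectionDAG patterns select it to ds_dec_rtn_u32 (LDS) or
+; flat_atomic_dec (global/flat).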
+ + +declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2 + +declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2 + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +; CI-LABEL: lds_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_dec_rtn_u32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +; CI-LABEL: lds_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_dec_rtn_u32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { +; CI-LABEL: lds_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_dec_rtn_u32 v0, v1, v0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_dec_rtn_u32 v0, v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_dec_rtn_u32 v0, v0, v1 +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +; CI-LABEL: lds_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_dec_rtn_u32 v0, v1, v0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_dec_rtn_u32 v0, v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_dec_rtn_u32 v0, v0, v1 +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32: +; VI: ; %bb.0: +; 
VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s2, s2, 16 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { +; CI-LABEL: global_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; 
VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { +; CI-LABEL: global_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 16 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 4, v1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 4, v0 +; CI-NEXT: v_mul_lo_u32 v4, 4, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 4, v1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 4, v0 +; VI-NEXT: v_mul_lo_u32 v4, 4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: 
v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id + %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 4, v1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 4, v0 +; CI-NEXT: v_mul_lo_u32 v0, 4, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 4, v1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 4, v0 +; VI-NEXT: v_mul_lo_u32 v0, 4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i32_offset_addr64: +; GFX9: ; 
%bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id + %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32* %out, i32* %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32* %out, i32* %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec 
v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s2, s2, 16 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32* %ptr) nounwind { +; CI-LABEL: flat_atomic_dec_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32* %ptr) nounwind { +; CI-LABEL: flat_atomic_dec_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 16 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 { +; 
CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 4, v1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 4, v0 +; CI-NEXT: v_mul_lo_u32 v4, 4, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 4, v1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 4, v0 +; VI-NEXT: v_mul_lo_u32 v4, 4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: flat_atomic_dec v0, v[0:1], v4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_store_dword v[2:3], v0 +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32* %ptr, i32 %id + %out.gep = getelementptr i32, i32* %out, i32 %id + %gep = getelementptr i32, i32* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32* %out.gep + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0 { +; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 4, v1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 4, v0 +; 
CI-NEXT: v_mul_lo_u32 v0, 4, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 4, v1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 4, v0 +; VI-NEXT: v_mul_lo_u32 v0, 4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32* %ptr, i32 %id + %gep = getelementptr i32, i32* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.dec.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64* %out, i64* %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 
+; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64* %out, i64* %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s2, s2, 32 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64* %ptr) nounwind { +; CI-LABEL: flat_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], 
v[0:1], v[2:3] glc +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64* %ptr) nounwind { +; CI-LABEL: flat_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 32 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 { +; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 8, v1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 8, v0 +; CI-NEXT: v_mul_lo_u32 v4, 8, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_mov_b32_e32 v5, 0 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 8, v1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 8, v0 +; VI-NEXT: v_mul_lo_u32 v4, 8, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; 
VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, 8, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i64, i64* %ptr, i32 %id + %out.gep = getelementptr i64, i64* %out, i32 %id + %gep = getelementptr i64, i64* %gep.tid, i32 5 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64* %out.gep + ret void +} + +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 { +; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 8, v1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 8, v0 +; CI-NEXT: v_mul_lo_u32 v0, 8, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 8, v1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 8, v0 +; VI-NEXT: v_mul_lo_u32 v0, 8, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, 8, v0 +; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i64, i64* %ptr, i32 %id + %gep = getelementptr i64, i64* %gep.tid, i32 5 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false) + ret void +} + +@lds0 = addrspace(3) global [512 x i32] undef + +define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; CI-LABEL: atomic_dec_shl_base_lds_0: +; CI: ; %bb.0: +; CI-NEXT: v_mul_lo_u32 v5, 4, v0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v6, 9 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0, v5 +; CI-NEXT: v_add_i32_e32 v5, vcc, 8, v5 +; CI-NEXT: ds_dec_rtn_u32 v5, v5, v6 +; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: flat_store_dword v[2:3], v5 +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_shl_base_lds_0: +; VI: ; %bb.0: +; VI-NEXT: v_mul_lo_u32 v5, 4, v0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0, v5 +; VI-NEXT: v_add_u32_e32 v5, vcc, 8, v5 +; VI-NEXT: ds_dec_rtn_u32 v5, v5, v6 +; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dword v[2:3], v5 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_shl_base_lds_0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 +; GFX9-NEXT: ds_dec_rtn_u32 v2, v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false) + store i32 %idx.0, i32 addrspace(1)* %add_use + store i32 %val0, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +; CI-LABEL: lds_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; 
CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +; CI-LABEL: lds_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 32 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void 
@lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { +; CI-LABEL: lds_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +; CI-LABEL: lds_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 32 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s2, s2, 32 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { +; CI-LABEL: global_atomic_dec_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: 
v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { +; CI-LABEL: global_atomic_dec_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 32 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 8, v1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 8, v0 +; CI-NEXT: v_mul_lo_u32 v4, 8, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_mov_b32_e32 v5, 0 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 8, v1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; 
VI-NEXT: v_mul_hi_u32 v3, 8, v0 +; VI-NEXT: v_mul_lo_u32 v4, 8, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, 8, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id + %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id + %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 8, v1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 8, v0 +; CI-NEXT: v_mul_lo_u32 v0, 8, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 8, v1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 8, v0 +; VI-NEXT: v_mul_lo_u32 v0, 8, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: 
v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, 8, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id + %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 + %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) + ret void +} + +@lds1 = addrspace(3) global [512 x i64] undef, align 8 + +define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; CI-LABEL: atomic_dec_shl_base_lds_0_i64: +; CI: ; %bb.0: +; CI-NEXT: v_mul_lo_u32 v7, 8, v0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_add_i32_e32 v6, vcc, 2, v0 +; CI-NEXT: v_mov_b32_e32 v0, 9 +; CI-NEXT: v_add_i32_e32 v7, vcc, 0, v7 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: v_add_i32_e32 v7, vcc, 16, v7 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v7, v[0:1] +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: v_mov_b32_e32 v4, s0 +; CI-NEXT: flat_store_dword v[2:3], v6 +; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_shl_base_lds_0_i64: +; VI: ; %bb.0: +; VI-NEXT: v_mul_lo_u32 v7, 8, v0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; VI-NEXT: v_mov_b32_e32 v0, 9 +; VI-NEXT: v_add_u32_e32 v7, vcc, 0, v7 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: v_add_u32_e32 v7, vcc, 16, v7 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v7, v[0:1] +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: v_mov_b32_e32 v4, s0 +; VI-NEXT: flat_store_dword v[2:3], v6 +; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mul_lo_u32 v3, 8, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: v_add_u32_e32 v4, 2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 +; GFX9-NEXT: ds_dec_rtn_u64 v[0:1], v0, v[1:2] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: global_store_dword v[2:3], v4, off +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 + %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false) + store i32 %idx.0, i32 addrspace(1)* %add_use + store i64 %val0, i64 addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind argmemonly } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -0,0 +1,1929 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=bonaire -verify-machineinstrs -stop-before=machine-scheduler < %s | FileCheck -enable-var-scope -check-prefixes=MIR %s + +; FIXME: Merge with the other test. DS offset folding doesn't work due to +; register bank copies, and the no-return optimization is missing. + + +declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2 +declare i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* nocapture, i32, i32, i32, i1) #2 + +declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2 +declare i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* nocapture, i64, i32, i32, i1) #2 + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +; CI-LABEL: lds_atomic_inc_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +;
GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false), !noalias !0 + store i32 %result, i32 addrspace(1)* %out + ret void +} + +!0 = distinct !{!0, !"noalias-scope"} + +define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +; CI-LABEL: lds_atomic_inc_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { +; CI-LABEL: lds_atomic_inc_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +; CI-LABEL: lds_atomic_inc_noret_i32_offset: +; 
CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: v_mov_b32_e32 v1, s0 +; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 16 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1 +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 16 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 
v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s2, s2, 16 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { +; CI-LABEL: global_atomic_inc_noret_i32: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i32: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { +; CI-LABEL: global_atomic_inc_noret_i32_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 16 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i32_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i32_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 16 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: 
flat_atomic_inc v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 4, v1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 4, v0 +; CI-NEXT: v_mul_lo_u32 v4, 4, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 4, v1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 4, v0 +; VI-NEXT: v_mul_lo_u32 v4, 4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, 42 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id + %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id + %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 
42, i32 0, i32 0, i1 false) + store i32 %result, i32 addrspace(1)* %out.gep + ret void +} + +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 4, v1 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 4, v0 +; CI-NEXT: v_mul_lo_u32 v0, 4, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 4, v1 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 4, v0 +; VI-NEXT: v_mul_lo_u32 v0, 4, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, 4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GFX9-NEXT: s_endpgm + %id = call i32 @llvm.amdgcn.workitem.id.x() + %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id + %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 + %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false) + ret void +} + +@lds0 = addrspace(3) global [512 x i32] undef, align 4 + +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; CI-LABEL: atomic_inc_shl_base_lds_0_i32: +; CI: ; %bb.0: +; CI-NEXT: v_mul_lo_u32 v5, 4, v0 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v6, 9 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0, v5 +; CI-NEXT: v_add_i32_e32 v5, vcc, 8, v5 +; CI-NEXT: ds_inc_rtn_u32 v5, v5, v6 +; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: flat_store_dword v[2:3], v5 +; CI-NEXT: s_endpgm +; +; VI-LABEL: 
atomic_inc_shl_base_lds_0_i32: +; VI: ; %bb.0: +; VI-NEXT: v_mul_lo_u32 v5, 4, v0 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0, v5 +; VI-NEXT: v_add_u32_e32 v5, vcc, 8, v5 +; VI-NEXT: ds_inc_rtn_u32 v5, v5, v6 +; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dword v[0:1], v4 +; VI-NEXT: flat_store_dword v[2:3], v5 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mul_lo_u32 v1, 4, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 9 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v1 +; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 +; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: s_endpgm + %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 + %idx.0 = add nsw i32 %tid.x, 2 + %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 + %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false) + store i32 %idx.0, i32 addrspace(1)* %add_use + store i32 %val0, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +; CI-LABEL: lds_atomic_inc_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void 
@lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +; CI-LABEL: lds_atomic_inc_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; CI-NEXT: s_load_dword s0, s[0:1], 0xb +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_ret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 32 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { +; CI-LABEL: lds_atomic_inc_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +; CI-LABEL: lds_atomic_inc_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dword s0, s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v0, 42 +; CI-NEXT: v_mov_b32_e32 v1, 0 +; CI-NEXT: s_mov_b32 m0, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: v_mov_b32_e32 
v2, s0 +; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: lds_atomic_inc_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dword s0, s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v0, 42 +; VI-NEXT: v_mov_b32_e32 v1, 0 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_atomic_inc_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 32 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_addc_u32 s3, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i64_offset: +; VI: ; 
%bb.0: +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_addc_u32 s3, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s2, s2, 32 +; GFX9-NEXT: s_addc_u32 s3, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) + store i64 %result, i64 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { +; CI-LABEL: global_atomic_inc_noret_i64: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i64: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_endpgm + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { +; CI-LABEL: global_atomic_inc_noret_i64_offset: +; CI: ; %bb.0: +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_addc_u32 s1, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_noret_i64_offset: +; VI: ; %bb.0: +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_atomic_inc_x2 
v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_noret_i64_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 s0, s0, 32 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_endpgm + %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 + %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; CI: ; %bb.0: +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_mul_lo_u32 v2, 0, v0 +; CI-NEXT: v_mul_lo_u32 v1, 8, v1 +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: v_mul_hi_u32 v3, 8, v0 +; CI-NEXT: v_mul_lo_u32 v4, 8, v0 +; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 +; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v1 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; CI-NEXT: v_mov_b32_e32 v5, s1 +; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: v_mov_b32_e32 v4, 42 +; CI-NEXT: v_mov_b32_e32 v5, 0 +; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: s_endpgm +; +; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; VI: ; %bb.0: +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_mul_lo_u32 v2, 0, v0 +; VI-NEXT: v_mul_lo_u32 v1, 8, v1 +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: v_mul_hi_u32 v3, 8, v0 +; VI-NEXT: v_mul_lo_u32 v4, 8, v0 +; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 +; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v1 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; VI-NEXT: v_mov_b32_e32 v5, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: v_mov_b32_e32 v4, 42 +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0 +; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0 +; GFX9-NEXT: v_mul_lo_u32 v4, 8, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_add_u32_e32 v3, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, 
vcc, v5, v3, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 42
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: s_endpgm
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
+  %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
+  store i64 %result, i64 addrspace(1)* %out.gep
+  ret void
+}
+
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; CI-NEXT: v_mul_lo_u32 v2, 0, v0
+; CI-NEXT: v_mul_lo_u32 v1, 8, v1
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT: v_mul_hi_u32 v3, 8, v0
+; CI-NEXT: v_mul_lo_u32 v0, 8, v0
+; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 0
+; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT: v_mul_lo_u32 v2, 0, v0
+; VI-NEXT: v_mul_lo_u32 v1, 8, v1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mul_hi_u32 v3, 8, v0
+; VI-NEXT: v_mul_lo_u32 v0, 8, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: s_endpgm
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+  %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32* %out, i32* %ptr) #0 {
+; CI-LABEL: flat_atomic_inc_ret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_endpgm
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
+  store i32 %result, i32* %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32* %out, i32* %ptr) #0 {
+; CI-LABEL: flat_atomic_inc_ret_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s2, s2, 16
+; CI-NEXT: s_addc_u32 s3, s3, 0
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_store_dword v[0:1], v2
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i32_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s2, 16
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_u32 s2, s2, 16
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: s_endpgm
+  %gep = getelementptr i32, i32* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
+  store i32 %result, i32* %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32* %ptr) nounwind {
+; CI-LABEL: flat_atomic_inc_noret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GFX9-NEXT: s_endpgm
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %ptr, i32 42, i32 0, i32 0, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32* %ptr) nounwind {
+; CI-LABEL: flat_atomic_inc_noret_i32_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s0, s0, 16
+; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i32_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_u32 s0, s0, 16
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GFX9-NEXT: s_endpgm
+  %gep = getelementptr i32, i32* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 {
+; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; CI-NEXT: v_mul_lo_u32 v2, 0, v0
+; CI-NEXT: v_mul_lo_u32 v1, 4, v1
+; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT: v_mul_hi_u32 v3, 4, v0
+; CI-NEXT: v_mul_lo_u32 v4, 4, v0
+; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4
+; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4
+; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_store_dword v[2:3], v0
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT: v_mul_lo_u32 v2, 0, v0
+; VI-NEXT: v_mul_lo_u32 v1, 4, v1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mul_hi_u32 v3, 4, v0
+; VI-NEXT: v_mul_lo_u32 v4, 4, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4
+; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_store_dword v[2:3], v0
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0
+; GFX9-NEXT: v_mul_lo_u32 v4, 4, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_add_u32_e32 v3, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_mov_b32_e32 v4, 42
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_store_dword v[2:3], v0
+; GFX9-NEXT: s_endpgm
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i32, i32* %ptr, i32 %id
+  %out.gep = getelementptr i32, i32* %out, i32 %id
+  %gep = getelementptr i32, i32* %gep.tid, i32 5
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
+  store i32 %result, i32* %out.gep
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 {
+; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; CI-NEXT: v_mul_lo_u32 v2, 0, v0
+; CI-NEXT: v_mul_lo_u32 v1, 4, v1
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT: v_mul_hi_u32 v3, 4, v0
+; CI-NEXT: v_mul_lo_u32 v0, 4, v0
+; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT: v_mul_lo_u32 v2, 0, v0
+; VI-NEXT: v_mul_lo_u32 v1, 4, v1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mul_hi_u32 v3, 4, v0
+; VI-NEXT: v_mul_lo_u32 v0, 4, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, 4, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, 4, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, 4, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
+; GFX9-NEXT: s_endpgm
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i32, i32* %ptr, i32 %id
+  %gep = getelementptr i32, i32* %gep.tid, i32 5
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p0i32(i32* %gep, i32 42, i32 0, i32 0, i1 false)
+  ret void
+}
+
+@lds1 = addrspace(3) global [512 x i64] undef, align 8
+
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; CI-LABEL: atomic_inc_shl_base_lds_0_i64:
+; CI: ; %bb.0:
+; CI-NEXT: v_mul_lo_u32 v7, 8, v0
+; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT: v_add_i32_e32 v6, vcc, 2, v0
+; CI-NEXT: v_mov_b32_e32 v0, 9
+; CI-NEXT: v_add_i32_e32 v7, vcc, 0, v7
+; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: v_add_i32_e32 v7, vcc, 16, v7
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: ds_inc_rtn_u64 v[0:1], v7, v[0:1]
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: v_mov_b32_e32 v3, s3
+; CI-NEXT: v_mov_b32_e32 v4, s0
+; CI-NEXT: flat_store_dword v[2:3], v6
+; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: atomic_inc_shl_base_lds_0_i64:
+; VI: ; %bb.0:
+; VI-NEXT: v_mul_lo_u32 v7, 8, v0
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; VI-NEXT: v_mov_b32_e32 v0, 9
+; VI-NEXT: v_add_u32_e32 v7, vcc, 0, v7
+; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: v_add_u32_e32 v7, vcc, 16, v7
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: ds_inc_rtn_u64 v[0:1], v7, v[0:1]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dword v[2:3], v6
+; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mul_lo_u32 v3, 8, v0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v1, 9
+; GFX9-NEXT: v_add_u32_e32 v4, 2, v0
+; GFX9-NEXT: v_add_u32_e32 v0, 0, v3
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: v_add_u32_e32 v0, 16, v0
+; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v0, v[1:2]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_store_dword v[2:3], v4, off
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX9-NEXT: s_endpgm
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
+  store i32 %idx.0, i32 addrspace(1)* %add_use
+  store i64 %val0, i64 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64* %out, i64* %ptr) #0 {
+; CI-LABEL: flat_atomic_inc_ret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: v_mov_b32_e32 v3, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_endpgm
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
+  store i64 %result, i64* %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64* %out, i64* %ptr) #0 {
+; CI-LABEL: flat_atomic_inc_ret_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: v_mov_b32_e32 v3, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s2, s2, 32
+; CI-NEXT: s_addc_u32 s3, s3, 0
+; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: v_mov_b32_e32 v1, s3
+; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i64_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_u32 s2, s2, 32
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_endpgm
+  %gep = getelementptr i64, i64* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
+  store i64 %result, i64* %out
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64* %ptr) nounwind {
+; CI-LABEL: flat_atomic_inc_noret_i64:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: v_mov_b32_e32 v3, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i64:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: s_endpgm
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %ptr, i64 42, i32 0, i32 0, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64* %ptr) nounwind {
+; CI-LABEL: flat_atomic_inc_noret_i64_offset:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: v_mov_b32_e32 v3, 0
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_u32 s0, s0, 32
+; CI-NEXT: s_addc_u32 s1, s1, 0
+; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
+; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i64_offset:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_u32 s0, s0, 32
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_add_u32 s0, s0, 32
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: s_endpgm
+  %gep = getelementptr i64, i64* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 {
+; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; CI-NEXT: v_mul_lo_u32 v2, 0, v0
+; CI-NEXT: v_mul_lo_u32 v1, 8, v1
+; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; CI-NEXT: v_mul_hi_u32 v3, 8, v0
+; CI-NEXT: v_mul_lo_u32 v4, 8, v0
+; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s3
+; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4
+; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v1
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; CI-NEXT: v_mov_b32_e32 v5, s1
+; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4
+; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; CI-NEXT: v_mov_b32_e32 v4, 42
+; CI-NEXT: v_mov_b32_e32 v5, 0
+; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc
+; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT: v_mul_lo_u32 v2, 0, v0
+; VI-NEXT: v_mul_lo_u32 v1, 8, v1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: v_mul_hi_u32 v3, 8, v0
+; VI-NEXT: v_mul_lo_u32 v4, 8, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4
+; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v1
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; VI-NEXT: v_mov_b32_e32 v4, 42
+; VI-NEXT: v_mov_b32_e32 v5, 0
+; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0
+; GFX9-NEXT: v_mul_lo_u32 v4, 8, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_add_u32_e32 v3, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v1
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX9-NEXT: v_mov_b32_e32 v4, 42
+; GFX9-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_endpgm
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i64, i64* %ptr, i32 %id
+  %out.gep = getelementptr i64, i64* %out, i32 %id
+  %gep = getelementptr i64, i64* %gep.tid, i32 5
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
+  store i64 %result, i64* %out.gep
+  ret void
+}
+
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 {
+; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
+; CI: ; %bb.0:
+; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; CI-NEXT: v_mul_lo_u32 v2, 0, v0
+; CI-NEXT: v_mul_lo_u32 v1, 8, v1
+; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; CI-NEXT: v_mul_hi_u32 v3, 8, v0
+; CI-NEXT: v_mul_lo_u32 v0, 8, v0
+; CI-NEXT: v_add_i32_e32 v1, vcc, v2, v1
+; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v4, s1
+; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 0
+; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
+; VI: ; %bb.0:
+; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VI-NEXT: v_mul_lo_u32 v2, 0, v0
+; VI-NEXT: v_mul_lo_u32 v1, 8, v1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: v_mul_hi_u32 v3, 8, v0
+; VI-NEXT: v_mul_lo_u32 v0, 8, v0
+; VI-NEXT: v_add_u32_e32 v1, vcc, v2, v1
+; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
+; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 0
+; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mul_lo_u32 v2, 0, v0
+; GFX9-NEXT: v_mul_lo_u32 v1, 8, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, 8, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, 8, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v4, s1
+; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
+; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
+; GFX9-NEXT: s_endpgm
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i64, i64* %ptr, i32 %id
+  %gep = getelementptr i64, i64* %gep.tid, i32 5
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p0i64(i64* %gep, i64 42, i32 0, i32 0, i1 false)
+  ret void
+}
+
+define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 {
+; CI-LABEL: nocse_lds_atomic_inc_ret_i32:
+; CI: ; %bb.0:
+; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
+; CI-NEXT: s_load_dword s0, s[0:1], 0xd
+; CI-NEXT: v_mov_b32_e32 v2, 42
+; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_mov_b32_e32 v0, s6
+; CI-NEXT: v_mov_b32_e32 v3, s0
+; CI-NEXT: ds_inc_rtn_u32 v4, v3, v2
+; CI-NEXT: ds_inc_rtn_u32 v5, v3, v2
+; CI-NEXT: v_mov_b32_e32 v2, s4
+; CI-NEXT: v_mov_b32_e32 v1, s7
+; CI-NEXT: v_mov_b32_e32 v3, s5
+; CI-NEXT: s_waitcnt lgkmcnt(1)
+; CI-NEXT: flat_store_dword v[2:3], v4
+; CI-NEXT: s_waitcnt lgkmcnt(1)
+; CI-NEXT: flat_store_dword v[0:1], v5
+; CI-NEXT: s_endpgm
+;
+; VI-LABEL: nocse_lds_atomic_inc_ret_i32:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_load_dword s0, s[0:1], 0x34
+; VI-NEXT: v_mov_b32_e32 v2, 42
+; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: ds_inc_rtn_u32 v4, v3, v2
+; VI-NEXT: ds_inc_rtn_u32 v5, v3, v2
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NEXT: flat_store_dword v[2:3], v4
+; VI-NEXT: s_waitcnt lgkmcnt(1)
+; VI-NEXT: flat_store_dword v[0:1], v5
+; VI-NEXT: s_endpgm
+;
+; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_inc_rtn_u32 v4, v1, v0
+; GFX9-NEXT: ds_inc_rtn_u32 v5, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_waitcnt lgkmcnt(1)
+; GFX9-NEXT: global_store_dword v[0:1], v4, off
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v[2:3], v5, off
+; GFX9-NEXT: s_endpgm
+  %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+  %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+
+  store i32 %result0, i32 addrspace(1)* %out0
+  store i32 %result1, i32 addrspace(1)* %out1
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }