Index: llvm/lib/Target/AMDGPU/AMDGPUGISel.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -226,10 +226,8 @@
 def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
 // FIXME: Check MMO is atomic
-def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>;
+def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
 def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>;
 def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>;
 def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>;
Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -515,10 +515,8 @@
   // isa<MemSDNode> almost works but is slightly too permissive for some DS
   // intrinsics.
   if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
-      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
-       Opc == ISD::ATOMIC_LOAD_FADD ||
-       Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
-       Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
+      Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
+      Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
     N = glueCopyToM0LDSInit(N);
     SelectCode(N);
     return;
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -504,8 +504,6 @@
   TBUFFER_LOAD_FORMAT_D16,
   DS_ORDERED_COUNT,
   ATOMIC_CMP_SWAP,
-  ATOMIC_INC,
-  ATOMIC_DEC,
   ATOMIC_LOAD_FMIN,
   ATOMIC_LOAD_FMAX,
   BUFFER_LOAD,
Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4419,8 +4419,6 @@
   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
   NODE_NAME_CASE(DS_ORDERED_COUNT)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
-  NODE_NAME_CASE(ATOMIC_INC)
-  NODE_NAME_CASE(ATOMIC_DEC)
  NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
   NODE_NAME_CASE(BUFFER_LOAD)
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3267,9 +3267,9 @@
   case TargetOpcode::G_ATOMICRMW_MAX:
   case TargetOpcode::G_ATOMICRMW_UMIN:
   case TargetOpcode::G_ATOMICRMW_UMAX:
+  case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
+  case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
   case TargetOpcode::G_ATOMICRMW_FADD:
-  case AMDGPU::G_AMDGPU_ATOMIC_INC:
-  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
     return selectG_LOAD_STORE_ATOMICRMW(I);
Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -645,6 +645,8 @@
 defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>;
 defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>;
 defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>;
+defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
+defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
 let MemoryVT = v2f16 in
 defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>;
 defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1323,7 +1323,7 @@
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND,
       G_ATOMICRMW_OR, G_ATOMICRMW_XOR, G_ATOMICRMW_MAX, G_ATOMICRMW_MIN,
       G_ATOMICRMW_UMAX,
-      G_ATOMICRMW_UMIN})
+      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                {S64, GlobalPtr}, {S64, LocalPtr},
                {S32, RegionPtr}, {S64, RegionPtr}});
@@ -4587,8 +4587,8 @@
 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                                MachineIRBuilder &B,
                                                bool IsInc) const {
-  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
-                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
+  unsigned Opc = IsInc ? AMDGPU::G_ATOMICRMW_UINC_WRAP :
+                         AMDGPU::G_ATOMICRMW_UDEC_WRAP;
   B.buildInstr(Opc)
     .addDef(MI.getOperand(0).getReg())
     .addUse(MI.getOperand(2).getReg())
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4774,9 +4774,9 @@
   case AMDGPU::G_ATOMICRMW_UMAX:
   case AMDGPU::G_ATOMICRMW_UMIN:
   case AMDGPU::G_ATOMICRMW_FADD:
+  case AMDGPU::G_ATOMICRMW_UINC_WRAP:
+  case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
-  case AMDGPU::G_AMDGPU_ATOMIC_INC:
-  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
   case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
     OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
Index: llvm/lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1489,8 +1489,8 @@
 defm : BufferAtomicPat<"atomic_load_and_global", Ty, "BUFFER_ATOMIC_AND" # Suffix>;
 defm : BufferAtomicPat<"atomic_load_or_global", Ty, "BUFFER_ATOMIC_OR" # Suffix>;
 defm : BufferAtomicPat<"atomic_load_xor_global", Ty, "BUFFER_ATOMIC_XOR" # Suffix>;
-defm : BufferAtomicPat<"atomic_inc_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>;
-defm : BufferAtomicPat<"atomic_dec_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_uinc_wrap_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>;
 } // end foreach Ty
Index: llvm/lib/Target/AMDGPU/DSInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/DSInstructions.td
+++ llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1069,8 +1069,8 @@
 defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">;
 defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U32, DS_ADD_U32, i32, "atomic_load_add">;
 defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U32, DS_SUB_U32, i32, "atomic_load_sub">;
-defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U32, DS_INC_U32, i32, "atomic_inc">;
-defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U32, DS_DEC_U32, i32, "atomic_dec">;
+defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U32, DS_INC_U32, i32, "atomic_load_uinc_wrap">;
+defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U32, DS_DEC_U32, i32, "atomic_load_udec_wrap">;
 defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B32, DS_AND_B32, i32, "atomic_load_and">;
 defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B32, DS_OR_B32, i32, "atomic_load_or">;
 defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B32, DS_XOR_B32, i32, "atomic_load_xor">;
@@ -1097,8 +1097,8 @@
 defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">;
 defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U64, DS_ADD_U64, i64, "atomic_load_add">;
 defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U64, DS_SUB_U64, i64, "atomic_load_sub">;
-defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U64, DS_INC_U64, i64, "atomic_inc">;
-defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U64, DS_DEC_U64, i64, "atomic_dec">;
+defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U64, DS_INC_U64, i64, "atomic_load_uinc_wrap">;
+defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U64, DS_DEC_U64, i64, "atomic_load_udec_wrap">;
 defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B64, DS_AND_B64, i64, "atomic_load_and">;
 defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B64, DS_OR_B64, i64, "atomic_load_or">;
 defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B64, DS_XOR_B64, i64, "atomic_load_xor">;
Index: llvm/lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1153,8 +1153,8 @@
 foreach as = [ "flat", "global" ] in {
 defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_SUB", "atomic_load_sub_"#as, i32>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_inc_"#as, i32>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_dec_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_load_uinc_wrap_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_load_udec_wrap_"#as, i32>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_AND", "atomic_load_and_"#as, i32>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX", "atomic_load_max_"#as, i32>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX", "atomic_load_umax_"#as, i32>;
@@ -1167,8 +1167,8 @@
 
 defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_X2", "atomic_load_add_"#as, i64>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_SUB_X2", "atomic_load_sub_"#as, i64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_inc_"#as, i64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_dec_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_load_uinc_wrap_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_load_udec_wrap_"#as, i64>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_AND_X2", "atomic_load_and_"#as, i64>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX_X2", "atomic_load_max_"#as, i64>;
 defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX_X2", "atomic_load_umax_"#as, i64>;
@@ -1422,8 +1422,8 @@
 
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", "atomic_inc_global", i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_dec_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", "atomic_load_uinc_wrap_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_load_udec_wrap_global", i32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", "atomic_load_and_global", i32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", "atomic_load_max_global", i32>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", "atomic_load_umax_global", i32>;
@@ -1437,8 +1437,8 @@
 
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_inc_global", i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_dec_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_load_uinc_wrap_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_load_udec_wrap_global", i64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", "atomic_load_and_global", i64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", "atomic_load_max_global", i64>;
 defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", "atomic_load_umax_global", i64>;
Index: llvm/lib/Target/AMDGPU/R600ISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -113,6 +113,9 @@
                                SelectionDAG &DAG) const;
 
   SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+
+  TargetLowering::AtomicExpansionKind
+  shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override;
 };
 
 } // End namespace llvm;
Index: llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2158,3 +2158,18 @@
 
   return Node;
 }
+
+TargetLowering::AtomicExpansionKind
+R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+  switch (RMW->getOperation()) {
+  case AtomicRMWInst::UIncWrap:
+  case AtomicRMWInst::UDecWrap:
+    // FIXME: Cayman at least appears to have instructions for this, but the
+    // instruction definitions appear to be missing.
+    return AtomicExpansionKind::CmpXChg;
+  default:
+    break;
+  }
+
+  return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
+}
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -767,6 +767,8 @@
                        ISD::ATOMIC_LOAD_UMIN,
                        ISD::ATOMIC_LOAD_UMAX,
                        ISD::ATOMIC_LOAD_FADD,
+                       ISD::ATOMIC_LOAD_UINC_WRAP,
+                       ISD::ATOMIC_LOAD_UDEC_WRAP,
                        ISD::INTRINSIC_VOID,
                        ISD::INTRINSIC_W_CHAIN});
@@ -7269,6 +7271,8 @@
                                    M->getVTList(), Ops, M->getMemoryVT(),
                                    M->getMemOperand());
   }
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec:
   case Intrinsic::amdgcn_ds_fadd: {
     MemSDNode *M = cast<MemSDNode>(Op);
     unsigned Opc;
@@ -7276,24 +7280,28 @@
     case Intrinsic::amdgcn_ds_fadd:
       Opc = ISD::ATOMIC_LOAD_FADD;
       break;
+    case Intrinsic::amdgcn_atomic_inc:
+      Opc = ISD::ATOMIC_LOAD_UINC_WRAP;
+      break;
+    case Intrinsic::amdgcn_atomic_dec:
+      Opc = ISD::ATOMIC_LOAD_UDEC_WRAP;
+      break;
     }
 
     return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
                          M->getOperand(0), M->getOperand(2), M->getOperand(3),
                          M->getMemOperand());
   }
-  case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
     MemSDNode *M = cast<MemSDNode>(Op);
     unsigned Opc;
     switch (IntrID) {
-    case Intrinsic::amdgcn_atomic_inc:
-      Opc = AMDGPUISD::ATOMIC_INC;
-      break;
-    case Intrinsic::amdgcn_atomic_dec:
-      Opc = AMDGPUISD::ATOMIC_DEC;
-      break;
     case Intrinsic::amdgcn_ds_fmin:
       Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
@@ -12714,8 +12722,6 @@
     return AMDGPU::isIntrinsicSourceOfDivergence(
         cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
   case AMDGPUISD::ATOMIC_CMP_SWAP:
-  case AMDGPUISD::ATOMIC_INC:
-  case AMDGPUISD::ATOMIC_DEC:
   case AMDGPUISD::ATOMIC_LOAD_FMIN:
   case AMDGPUISD::ATOMIC_LOAD_FMAX:
   case AMDGPUISD::BUFFER_ATOMIC_SWAP:
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -50,14 +50,6 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain, SDNPInGlue]
 >;
 
-def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
-  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
-def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
-  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
 def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
   SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
 ]>;
@@ -353,8 +345,6 @@
 // PatFrags for global memory operations
 //===----------------------------------------------------------------------===//
 
-defm atomic_inc : binary_atomic_op_all_as<SIatomic_inc>;
-defm atomic_dec : binary_atomic_op_all_as<SIatomic_dec>;
 defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>;
 defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>;
 
@@ -766,8 +756,8 @@
 defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
 defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
-defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
-defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
+defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
+defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
<"LOAD_UDEC_WRAP">; defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">; defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">; defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">; Index: llvm/lib/Target/AMDGPU/SIInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/SIInstructions.td +++ llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3339,8 +3339,6 @@ } let Namespace = "AMDGPU" in { -def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP; -def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP; def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP; def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP; } Index: llvm/test/CodeGen/AMDGPU/flat_atomics.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -1274,3 +1274,209 @@ store atomic half %in, half* %out seq_cst, align 2 ret void } + +; GCN-LABEL: {{^}}atomic_inc_i32_offset: +; CIVI: flat_atomic_inc v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX9: flat_atomic_inc v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16{{$}} +define amdgpu_kernel void @atomic_inc_i32_offset(i32* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile inc i32* %gep, i32 %in seq_cst + ret void +} + +; GCN-LABEL: {{^}}atomic_inc_i32_max_offset: +; CIVI: flat_atomic_inc v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +; GFX9: flat_atomic_inc v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:4092{{$}} +define amdgpu_kernel void @atomic_inc_i32_max_offset(i32* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32* %out, i32 1023 + %val = atomicrmw volatile inc i32* %gep, i32 %in seq_cst + ret void +} + +; GCN-LABEL: {{^}}atomic_inc_i32_max_offset_p1: +; GCN: flat_atomic_inc v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} +define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(i32* %out, i32 %in) { +entry: + %gep = getelementptr i32, i32* %out, i32 1024 + %val = atomicrmw volatile inc i32* %gep, i32 %in seq_cst + ret void +} + +; GCN-LABEL: {{^}}atomic_inc_i32_ret_offset: +; CIVI: flat_atomic_inc [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX9: flat_atomic_inc [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @atomic_inc_i32_ret_offset(i32* %out, i32* %out2, i32 %in) { +entry: + %gep = getelementptr i32, i32* %out, i32 4 + %val = atomicrmw volatile inc i32* %gep, i32 %in seq_cst + store i32 %val, i32* %out2 + ret void +} + +; GCN-LABEL: {{^}}atomic_inc_i32_incr64_offset: +; CIVI: flat_atomic_inc v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} +; GFX9: flat_atomic_inc v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}} +define amdgpu_kernel void @atomic_inc_i32_incr64_offset(i32* %out, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile inc i32* %gep, i32 %in seq_cst + ret void +} + +; GCN-LABEL: {{^}}atomic_inc_i32_ret_incr64_offset: +; CIVI: flat_atomic_inc [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} +; GFX9: flat_atomic_inc [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}} +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] +define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) { +entry: + %ptr = getelementptr i32, i32* %out, i64 %index + %gep = getelementptr i32, i32* %ptr, i32 4 + %val = atomicrmw volatile inc i32* %gep, 
+  store i32 %val, i32* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32:
+; GCN: flat_atomic_inc v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define amdgpu_kernel void @atomic_inc_i32(i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile uinc_wrap i32* %out, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_ret:
+; GCN: flat_atomic_inc [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_inc_i32_ret(i32* %out, i32* %out2, i32 %in) {
+entry:
+  %val = atomicrmw volatile uinc_wrap i32* %out, i32 %in seq_cst
+  store i32 %val, i32* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_incr64:
+; GCN: flat_atomic_inc v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define amdgpu_kernel void @atomic_inc_i32_incr64(i32* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32* %out, i64 %index
+  %val = atomicrmw volatile uinc_wrap i32* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_ret_incr64:
+; GCN: flat_atomic_inc [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_inc_i32_ret_incr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32* %out, i64 %index
+  %val = atomicrmw volatile uinc_wrap i32* %ptr, i32 %in seq_cst
+  store i32 %val, i32* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_offset:
+; CIVI: flat_atomic_dec v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GFX9: flat_atomic_dec v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_dec_i32_offset(i32* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = atomicrmw volatile udec_wrap i32* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_max_offset:
+; CIVI: flat_atomic_dec v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+; GFX9: flat_atomic_dec v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:4092{{$}}
+define amdgpu_kernel void @atomic_dec_i32_max_offset(i32* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 1023
+  %val = atomicrmw volatile udec_wrap i32* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_max_offset_p1:
+; GCN: flat_atomic_dec v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(i32* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 1024
+  %val = atomicrmw volatile udec_wrap i32* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_ret_offset:
+; CIVI: flat_atomic_dec [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_dec [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_dec_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32* %out, i32 4
+  %val = atomicrmw volatile udec_wrap i32* %gep, i32 %in seq_cst
+  store i32 %val, i32* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_decr64_offset:
+; CIVI: flat_atomic_dec v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: flat_atomic_dec v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
+define amdgpu_kernel void @atomic_dec_i32_decr64_offset(i32* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32* %out, i64 %index
+  %gep = getelementptr i32, i32* %ptr, i32 4
+  %val = atomicrmw volatile udec_wrap i32* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_ret_decr64_offset:
+; CIVI: flat_atomic_dec [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GFX9: flat_atomic_dec [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32* %out, i64 %index
+  %gep = getelementptr i32, i32* %ptr, i32 4
+  %val = atomicrmw volatile udec_wrap i32* %gep, i32 %in seq_cst
+  store i32 %val, i32* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32:
+; GCN: flat_atomic_dec v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define amdgpu_kernel void @atomic_dec_i32(i32* %out, i32 %in) {
+entry:
+  %val = atomicrmw volatile udec_wrap i32* %out, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_ret:
+; GCN: flat_atomic_dec [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_dec_i32_ret(i32* %out, i32* %out2, i32 %in) {
+entry:
+  %val = atomicrmw volatile udec_wrap i32* %out, i32 %in seq_cst
+  store i32 %val, i32* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_decr64:
+; GCN: flat_atomic_dec v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+define amdgpu_kernel void @atomic_dec_i32_decr64(i32* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32* %out, i64 %index
+  %val = atomicrmw volatile udec_wrap i32* %ptr, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_ret_decr64:
+; GCN: flat_atomic_dec [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_dec_i32_ret_decr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32* %out, i64 %index
+  %val = atomicrmw volatile udec_wrap i32* %ptr, i32 %in seq_cst
+  store i32 %val, i32* %out2
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -1071,3 +1071,163 @@
   store atomic double %in, double* %ptr seq_cst, align 8
   ret void
 }
+
+; GCN-LABEL: {{^}}atomic_inc_i64_offset:
+; GCN: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define amdgpu_kernel void @atomic_inc_i64_offset(i64* %out, i64 %in) {
+entry:
+  %gep = getelementptr i64, i64* %out, i64 4
+  %tmp0 = atomicrmw volatile uinc_wrap i64* %gep, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64_ret_offset:
+; GCN: flat_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_inc_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+entry:
+  %gep = getelementptr i64, i64* %out, i64 4
+  %tmp0 = atomicrmw volatile uinc_wrap i64* %gep, i64 %in seq_cst
+  store i64 %tmp0, i64* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64_incr64_offset:
+; GCN: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define amdgpu_kernel void @atomic_inc_i64_incr64_offset(i64* %out, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64* %out, i64 %index
+  %gep = getelementptr i64, i64* %ptr, i64 4
+  %tmp0 = atomicrmw volatile uinc_wrap i64* %gep, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64_ret_incr64_offset:
+; GCN: flat_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64* %out, i64 %index
+  %gep = getelementptr i64, i64* %ptr, i64 4
+  %tmp0 = atomicrmw volatile uinc_wrap i64* %gep, i64 %in seq_cst
+  store i64 %tmp0, i64* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64:
+; GCN: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_inc_i64(i64* %out, i64 %in) {
+entry:
+  %tmp0 = atomicrmw volatile uinc_wrap i64* %out, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64_ret:
+; GCN: flat_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_inc_i64_ret(i64* %out, i64* %out2, i64 %in) {
+entry:
+  %tmp0 = atomicrmw volatile uinc_wrap i64* %out, i64 %in seq_cst
+  store i64 %tmp0, i64* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64_incr64:
+; GCN: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_inc_i64_incr64(i64* %out, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64* %out, i64 %index
+  %tmp0 = atomicrmw volatile uinc_wrap i64* %ptr, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64_ret_incr64:
+; GCN: flat_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_inc_i64_ret_incr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64* %out, i64 %index
+  %tmp0 = atomicrmw volatile uinc_wrap i64* %ptr, i64 %in seq_cst
+  store i64 %tmp0, i64* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_offset:
+; GCN: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define amdgpu_kernel void @atomic_dec_i64_offset(i64* %out, i64 %in) {
+entry:
+  %gep = getelementptr i64, i64* %out, i64 4
+  %tmp0 = atomicrmw volatile udec_wrap i64* %gep, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_ret_offset:
+; GCN: flat_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_dec_i64_ret_offset(i64* %out, i64* %out2, i64 %in) {
+entry:
+  %gep = getelementptr i64, i64* %out, i64 4
+  %tmp0 = atomicrmw volatile udec_wrap i64* %gep, i64 %in seq_cst
+  store i64 %tmp0, i64* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_decr64_offset:
+; GCN: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+define amdgpu_kernel void @atomic_dec_i64_decr64_offset(i64* %out, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64* %out, i64 %index
+  %gep = getelementptr i64, i64* %ptr, i64 4
+  %tmp0 = atomicrmw volatile udec_wrap i64* %gep, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_ret_decr64_offset:
+; GCN: flat_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(i64* %out, i64* %out2, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64* %out, i64 %index
+  %gep = getelementptr i64, i64* %ptr, i64 4
+  %tmp0 = atomicrmw volatile udec_wrap i64* %gep, i64 %in seq_cst
+  store i64 %tmp0, i64* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64:
+; GCN: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_dec_i64(i64* %out, i64 %in) {
+entry:
+  %tmp0 = atomicrmw volatile udec_wrap i64* %out, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_ret:
+; GCN: flat_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_dec_i64_ret(i64* %out, i64* %out2, i64 %in) {
+entry:
+  %tmp0 = atomicrmw volatile udec_wrap i64* %out, i64 %in seq_cst
+  store i64 %tmp0, i64* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_decr64:
+; GCN: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
+define amdgpu_kernel void @atomic_dec_i64_decr64(i64* %out, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64* %out, i64 %index
+  %tmp0 = atomicrmw volatile udec_wrap i64* %ptr, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_ret_decr64:
+; GCN: flat_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
+; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
+define amdgpu_kernel void @atomic_dec_i64_ret_decr64(i64* %out, i64* %out2, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64* %out, i64 %index
+  %tmp0 = atomicrmw volatile udec_wrap i64* %ptr, i64 %in seq_cst
+  store i64 %tmp0, i64* %out2
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -1437,3 +1437,185 @@
   store atomic half %in, half addrspace(1)* %out seq_cst, align 2
   ret void
 }
+
+; GCN-LABEL: {{^}}atomic_inc_i32_offset:
+; SIVI: buffer_atomic_inc v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
+define amdgpu_kernel void @atomic_inc_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+  %val = atomicrmw volatile uinc_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_max_neg_offset:
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
+define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 -1024
+  %val = atomicrmw volatile uinc_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_soffset:
+; SIVI: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
+; SIVI: buffer_atomic_inc v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
+
+; GFX9: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8000{{$}}
+; GFX9: global_atomic_inc [[OFFSET]], v{{[0-9]+}}, s{{\[[0-9]:[0-9]+\]}} offset:3232{{$}}
+define amdgpu_kernel void @atomic_inc_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
+  %val = atomicrmw volatile uinc_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_huge_offset:
+; SI-DAG: v_mov_b32_e32 v[[PTRLO:[0-9]+]], 0xdeac
+; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd
+; SI: buffer_atomic_inc v{{[0-9]+}}, v[[[PTRLO]]:[[PTRHI]]], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+
+; VI: flat_atomic_inc
+
+; GFX9: s_add_u32 s[[LOW_K:[0-9]+]], s{{[0-9]+}}, 0xdeac
+; GFX9: s_addc_u32 s[[HIGH_K:[0-9]+]], s{{[0-9]+}}, 0xabcd
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, s[[[LOW_K]]:[[HIGH_K]]]{{$}}
+define amdgpu_kernel void @atomic_inc_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
+
+  %val = atomicrmw volatile uinc_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_ret_offset:
+; SIVI: buffer_atomic_inc [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; SIVI: buffer_store_dword [[RET]]
+
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
+define amdgpu_kernel void @atomic_inc_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+  %val = atomicrmw volatile uinc_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %val, i32 addrspace(1)* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_addr64_offset:
+; SI: buffer_atomic_inc v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+; VI: flat_atomic_inc v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: global_atomic_inc v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
+define amdgpu_kernel void @atomic_inc_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+  %val = atomicrmw volatile uinc_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i32_ret_addr64_offset:
+; SI: buffer_atomic_inc [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_atomic_inc [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; SIVI: buffer_store_dword [[RET]]
+
+; GFX9: global_atomic_inc [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
+; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s
+define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+  %val = atomicrmw volatile uinc_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %val, i32 addrspace(1)* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_offset:
+; SIVI: buffer_atomic_dec v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
+; GFX9: global_atomic_dec v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16{{$}}
+define amdgpu_kernel void @atomic_dec_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+  %val = atomicrmw volatile udec_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_max_neg_offset:
+; GFX9: global_atomic_dec v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}}
+define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 -1024
+  %val = atomicrmw volatile udec_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_soffset:
+; SIVI: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
+; SIVI: buffer_atomic_dec v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
+
+; GFX9: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8000{{$}}
+; GFX9: global_atomic_dec [[OFFSET]], v{{[0-9]+}}, s{{\[[0-9]:[0-9]+\]}} offset:3232{{$}}
+define amdgpu_kernel void @atomic_dec_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
+  %val = atomicrmw volatile udec_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_huge_offset:
+; SI-DAG: v_mov_b32_e32 v[[PTRLO:[0-9]+]], 0xdeac
+; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd
+; SI: buffer_atomic_dec v{{[0-9]+}}, v[[[PTRLO]]:[[PTRHI]]], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
+
+; VI: flat_atomic_dec
+
+; GFX9: s_add_u32 s[[LOW_K:[0-9]+]], s{{[0-9]+}}, 0xdeac
+; GFX9: s_addc_u32 s[[HIGH_K:[0-9]+]], s{{[0-9]+}}, 0xabcd
+; GFX9: global_atomic_dec v{{[0-9]+}}, v{{[0-9]+}}, s[[[LOW_K]]:[[HIGH_K]]]{{$}}
+define amdgpu_kernel void @atomic_dec_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
+
+  %val = atomicrmw volatile udec_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_ret_offset:
+; SIVI: buffer_atomic_dec [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
+; SIVI: buffer_store_dword [[RET]]
+
+; GFX9: global_atomic_dec v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}}
+define amdgpu_kernel void @atomic_dec_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+entry:
+  %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
+  %val = atomicrmw volatile udec_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %val, i32 addrspace(1)* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_addr64_offset:
+; SI: buffer_atomic_dec v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
+; VI: flat_atomic_dec v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
+; GFX9: global_atomic_dec v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16{{$}}
+define amdgpu_kernel void @atomic_dec_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+  %val = atomicrmw volatile udec_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i32_ret_addr64_offset:
+; SI: buffer_atomic_dec [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
+; VI: flat_atomic_dec [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; SIVI: buffer_store_dword [[RET]]
+
+; GFX9: global_atomic_dec [[RET:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}] offset:16 glc{{$}}
+; GFX9: global_store_dword v{{[0-9]+}}, [[RET]], s
+define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
+  %val = atomicrmw volatile udec_wrap i32 addrspace(1)* %gep, i32 %in seq_cst
+  store i32 %val, i32 addrspace(1)* %out2
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -1247,3 +1247,75 @@
   store atomic double %in, double addrspace(1)* %gep seq_cst, align 8
   ret void
 }
+
+; GCN-LABEL: {{^}}atomic_inc_i64_offset:
+; CIVI: buffer_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+
+; GFX9: global_atomic_inc_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}}
+define amdgpu_kernel void @atomic_inc_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+  %tmp0 = atomicrmw volatile uinc_wrap i64 addrspace(1)* %gep, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64_ret_offset:
+; CIVI: buffer_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; CIVI: buffer_store_dwordx2 [[RET]]
+
+; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}}
+define amdgpu_kernel void @atomic_inc_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+  %tmp0 = atomicrmw volatile uinc_wrap i64 addrspace(1)* %gep, i64 %in seq_cst
+  store i64 %tmp0, i64 addrspace(1)* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_inc_i64_incr64_offset:
+; CI: buffer_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9: global_atomic_inc_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
+define amdgpu_kernel void @atomic_inc_i64_incr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+  %tmp0 = atomicrmw volatile uinc_wrap i64 addrspace(1)* %gep, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_offset:
+; CIVI: buffer_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
+
+; GFX9: global_atomic_dec_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}}
+define amdgpu_kernel void @atomic_dec_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+  %tmp0 = atomicrmw volatile udec_wrap i64 addrspace(1)* %gep, i64 %in seq_cst
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_ret_offset:
+; CIVI: buffer_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
+; CIVI: buffer_store_dwordx2 [[RET]]
+
+; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}}
+define amdgpu_kernel void @atomic_dec_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+entry:
+  %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
+  %tmp0 = atomicrmw volatile udec_wrap i64 addrspace(1)* %gep, i64 %in seq_cst
+  store i64 %tmp0, i64 addrspace(1)* %out2
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_i64_decr64_offset:
+; CI: buffer_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
+; VI: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
+; GFX9: global_atomic_dec_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}}
+define amdgpu_kernel void @atomic_dec_i64_decr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+entry:
+  %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
+  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
+  %tmp0 = atomicrmw volatile udec_wrap i64 addrspace(1)* %gep, i64 %in seq_cst
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/local-atomics.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/local-atomics.ll
+++ llvm/test/CodeGen/AMDGPU/local-atomics.ll
@@ -728,3 +728,103 @@
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
 }
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; EG: LDS_CMPST
+; GCN: ds_inc_rtn_u32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw uinc_wrap i32 addrspace(3)* %ptr, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; EG: LDS_CMPST
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw uinc_wrap i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_inc_u32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw uinc_wrap i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_inc_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw uinc_wrap i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; EG: LDS_CMPST
+; GCN: ds_dec_rtn_u32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw udec_wrap i32 addrspace(3)* %ptr, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; EG: LDS_CMPST
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw udec_wrap i32 addrspace(3)* %gep, i32 4 seq_cst
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_dec_u32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw udec_wrap i32 addrspace(3)* %ptr, i32 4 seq_cst
+  ret void
+}
+
+; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_dec_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw udec_wrap i32 addrspace(3)* %gep, i32 4 seq_cst
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/local-atomics64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/local-atomics64.ll
+++ llvm/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -639,3 +639,121 @@
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
 }
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_inc_rtn_u64
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw uinc_wrap i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVDATA]]:[[HIVDATA]]] offset:32
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
+  %result = atomicrmw uinc_wrap i64 addrspace(3)* %gep, i64 9 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc1_ret_i64:
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_inc_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]]
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_inc1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw uinc_wrap i64 addrspace(3)* %ptr, i64 1 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc1_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_inc_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_inc1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw uinc_wrap i64 addrspace(3)* %gep, i64 1 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_dec_rtn_u64
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw udec_wrap i64 addrspace(3)* %ptr, i64 4 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; SI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
+; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
+; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v[[[LOVDATA]]:[[HIVDATA]]] offset:32
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
+  %result = atomicrmw udec_wrap i64 addrspace(3)* %gep, i64 9 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec1_ret_i64:
+; SICIVI-DAG: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN-DAG: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 1{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
+; GCN: ds_dec_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, v[[[LOVDATA]]:[[HIVDATA]]]
+; GCN: buffer_store_dwordx2 [[RESULT]],
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_dec1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %result = atomicrmw udec_wrap i64 addrspace(3)* %ptr, i64 1 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec1_ret_i64_offset:
+; SICIVI: s_mov_b32 m0
+; GFX9-NOT: m0
+
+; GCN: ds_dec_rtn_u64 {{.*}} offset:32
+; GCN: s_endpgm
+define amdgpu_kernel void @lds_atomic_dec1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = atomicrmw udec_wrap i64 addrspace(3)* %gep, i64 1 seq_cst
+  store i64 %result, i64 addrspace(1)* %out, align 8
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -537,7 +537,7 @@
 define i32 @atomicrmw_inc_private_i32(i32 addrspace(5)* %ptr) {
 ; IR-LABEL: @atomicrmw_inc_private_i32(
 ; IR-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(5)* [[PTR:%.*]], align 4
-; IR-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 0
+; IR-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1
 ; IR-NEXT:    [[TMP3:%.*]] = icmp uge i32 [[TMP1]], 4
 ; IR-NEXT:    [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]]
 ; IR-NEXT:    store i32 [[NEW]], i32 addrspace(5)* [[PTR]], align 4
@@ -548,13 +548,14 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v1
 ; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 4, v1
-; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
+; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw inc i32 addrspace(5)* %ptr, i32 4 seq_cst
+  %result = atomicrmw uinc_wrap i32 addrspace(5)* %ptr, i32 4 seq_cst
   ret i32 %result
 }
@@ -583,6 +584,6 @@
 ; GCN-NEXT:    v_mov_b32_e32 v0, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw dec i32 addrspace(5)* %ptr, i32 4 seq_cst
+  %result = atomicrmw udec_wrap i32 addrspace(5)* %ptr, i32 4 seq_cst
   ret i32 %result
 }
Index: llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -280,6 +280,34 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw uinc_wrap i32 addrspace(3)* %arrayidx0, i32 31 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+; GCN: s_endpgm
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
+  %val = atomicrmw udec_wrap i32 addrspace(3)* %arrayidx0, i32 31 seq_cst
+  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %idx.0, i32 addrspace(1)* %add_use, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_lds:
 ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
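
Note on the new operations, for reviewers: per the LangRef, `atomicrmw uinc_wrap` stores `(old >= val) ? 0 : (old + 1)` and `atomicrmw udec_wrap` stores `((old == 0) || (old > val)) ? val : (old - 1)`. The R600 hunk returns `AtomicExpansionKind::CmpXChg`, so on that target AtomicExpand rewrites the operation into a compare-exchange loop. Below is a minimal hand-written sketch of what that expansion looks like for `uinc_wrap`; the function name and the `%bound` operand are illustrative only and do not come from this patch or from the pass's actual output.

define i32 @uinc_wrap_via_cmpxchg(i32 addrspace(1)* %p, i32 %bound) {
entry:
  %start = load i32, i32 addrspace(1)* %p, align 4
  br label %loop

loop:                                 ; retry until the exchange succeeds
  %old = phi i32 [ %start, %entry ], [ %loaded, %loop ]
  ; uinc_wrap semantics: (old >= bound) ? 0 : old + 1
  %inc = add i32 %old, 1
  %wrap = icmp uge i32 %old, %bound
  %new = select i1 %wrap, i32 0, i32 %inc
  %pair = cmpxchg i32 addrspace(1)* %p, i32 %old, i32 %new seq_cst seq_cst
  %loaded = extractvalue { i32, i1 } %pair, 0
  %success = extractvalue { i32, i1 } %pair, 1
  br i1 %success, label %done, label %loop

done:                                 ; %old is the value the atomic op observed
  ret i32 %old
}

The private-memory-atomics.ll checks above show the same add/icmp uge/select sequence without the loop, since private (scratch) accesses are not concurrently visible and need no cmpxchg.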