Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -151,6 +151,17 @@
   [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]
 >;
 
+// TODO: Do we want an ordering for these?
+def int_amdgcn_atomic_inc : Intrinsic<[llvm_anyint_ty],
+  [llvm_anyptr_ty, LLVMMatchType<0>],
+  [IntrReadWriteArgMem, NoCapture<0>]
+>;
+
+def int_amdgcn_atomic_dec : Intrinsic<[llvm_anyint_ty],
+  [llvm_anyptr_ty, LLVMMatchType<0>],
+  [IntrReadWriteArgMem, NoCapture<0>]
+>;
+
 class AMDGPUImageLoad : Intrinsic <
   [llvm_v4f32_ty],   // vdata(VGPR)
   [llvm_anyint_ty,   // vaddr(VGPR)
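
For reference, a minimal IR sketch of how the new overloaded intrinsics are
called (function and value names here are illustrative; the mangled intrinsic
names match the ones exercised by the new tests below). Per the GCN ISA, inc
stores (old >= val) ? 0 : old + 1 and dec stores
(old == 0 || old > val) ? val : old - 1, and both return the pre-op value:

  declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #0

  define i32 @example_inc_lds(i32 addrspace(3)* %ptr) #1 {
    ; Wrapping increment of an LDS location: writes (old >= 42) ? 0 : old + 1
    ; and yields the value that was there before.
    %old = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
    ret i32 %old
  }

  attributes #0 = { nounwind argmemonly }
  attributes #1 = { nounwind }
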
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -334,7 +334,8 @@
     return nullptr;   // Already selected.
   }
 
-  if (isa<AtomicSDNode>(N))
+  if (isa<AtomicSDNode>(N) ||
+      (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC))
     N = glueCopyToM0(N);
 
   switch (Opc) {
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -313,6 +313,8 @@
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
+  ATOMIC_INC,
+  ATOMIC_DEC,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2812,6 +2812,8 @@
   NODE_NAME_CASE(INTERP_P2)
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(ATOMIC_INC)
+  NODE_NAME_CASE(ATOMIC_DEC)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/CIInstructions.td
===================================================================
--- lib/Target/AMDGPU/CIInstructions.td
+++ lib/Target/AMDGPU/CIInstructions.td
@@ -314,8 +314,10 @@
 >;
 
 def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
@@ -324,4 +326,7 @@
 def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
 def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
 
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
+
 } // End Predicates = [isCIVI]
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -30,6 +30,7 @@
                                MVT VT, unsigned Offset) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@@ -65,6 +66,9 @@
 public:
   SITargetLowering(TargetMachine &tm, const AMDGPUSubtarget &STI);
 
+  bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
+                          unsigned IntrinsicID) const override;
+
   bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
                           EVT /*VT*/) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -128,6 +128,8 @@
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
@@ -297,6 +299,25 @@
 // TargetLowering queries
 //===----------------------------------------------------------------------===//
 
+bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                          const CallInst &CI,
+                                          unsigned IntrID) const {
+  switch (IntrID) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = true;
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
                                           EVT) const {
   // SI has some legal vector types, but no legal vector operations. Say no
@@ -1163,6 +1184,7 @@
     return LowerGlobalAddress(MFI, Op, DAG);
   }
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
   }
   return SDValue();
@@ -1628,6 +1650,29 @@
   }
 }
 
+SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  switch (IntrID) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec: {
+    MemSDNode *M = cast<MemSDNode>(Op);
+    unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
+      AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
+    SDValue Ops[] = {
+      M->getOperand(0), // Chain
+      M->getOperand(2), // Ptr
+      M->getOperand(3)  // Value
+    };
+
+    return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
+                                   M->getMemoryVT(), M->getMemOperand());
+  }
+  default:
+    return SDValue();
+  }
+}
+
 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
                                               SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -2606,7 +2651,9 @@
   case ISD::ATOMIC_LOAD_MIN:
   case ISD::ATOMIC_LOAD_MAX:
   case ISD::ATOMIC_LOAD_UMIN:
-  case ISD::ATOMIC_LOAD_UMAX: { // TODO: Target mem intrinsics.
+  case ISD::ATOMIC_LOAD_UMAX:
+  case AMDGPUISD::ATOMIC_INC:
+  case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
     if (DCI.isBeforeLegalize())
       break;
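
Because getTgtMemIntrinsic reports both readMem and writeMem, the
INTRINSIC_W_CHAIN node built here carries a MachineMemOperand and stays
ordered on the chain, so a call whose result is unused is still not dead. A
minimal sketch of that shape (illustrative names; it mirrors the *_noret_*
cases in the new tests below):

  declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #0

  define void @example_inc_noret(i32 addrspace(1)* %ptr) #1 {
    ; The result is ignored, but the memory side effect must survive, so this
    ; should still select to a non-returning buffer/flat atomic.
    %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
    ret void
  }

  attributes #0 = { nounwind argmemonly }
  attributes #1 = { nounwind }
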
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -95,6 +95,14 @@
   [SDNPMayLoad, SDNPMemOperand]
 >;
 
+def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
+  [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
 def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
   SDTypeProfile<0, 13,
     [SDTCisVT<0, v4i32>,   // rsrc(SGPR)
@@ -173,6 +181,13 @@
 }]>;
 
 //===----------------------------------------------------------------------===//
+// PatFrags for global memory operations
+//===----------------------------------------------------------------------===//
+
+def atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
+def atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+
+//===----------------------------------------------------------------------===//
 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
 // to be glued to the memory instructions.
 //===----------------------------------------------------------------------===//
@@ -271,9 +286,10 @@
   return isCBranchSCC(N);
 }]>;
 
-multiclass SIAtomicM0Glue2 <string op_name> {
+multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
 
-  def _glue : SDNode <"ISD::ATOMIC_"#op_name, SDTAtomic2,
+  def _glue : SDNode <
+    !if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, SDTAtomic2,
     [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
   >;
 
@@ -281,11 +297,13 @@
 }
 
 defm si_atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
+defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm si_atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
+defm si_atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
 defm si_atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
 defm si_atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
 defm si_atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
 defm si_atomic_load_or : SIAtomicM0Glue2 <"LOAD_OR">;
-defm si_atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
 defm si_atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
 defm si_atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
 defm si_atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -1042,8 +1042,13 @@
 defm BUFFER_ATOMIC_XOR : MUBUF_Atomic <
   mubuf<0x3b, 0x4a>, "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
 >;
-//def BUFFER_ATOMIC_INC : MUBUF_ <mubuf<0x3c, 0x4b>, "buffer_atomic_inc", []>;
-//def BUFFER_ATOMIC_DEC : MUBUF_ <mubuf<0x3d, 0x4c>, "buffer_atomic_dec", []>;
+defm BUFFER_ATOMIC_INC : MUBUF_Atomic <
+  mubuf<0x3c, 0x4b>, "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC : MUBUF_Atomic <
+  mubuf<0x3d, 0x4c>, "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+>;
+
 //def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <mubuf<0x3e>, "buffer_atomic_fcmpswap", []>; // isn't on VI
 //def BUFFER_ATOMIC_FMIN : MUBUF_ <mubuf<0x3f>, "buffer_atomic_fmin", []>; // isn't on VI
 //def BUFFER_ATOMIC_FMAX : MUBUF_ <mubuf<0x40>, "buffer_atomic_fmax", []>; // isn't on VI
@@ -1059,8 +1064,12 @@
 //def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <mubuf<0x59>, "buffer_atomic_and_x2", []>;
 //def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <mubuf<0x5a>, "buffer_atomic_or_x2", []>;
 //def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <mubuf<0x5b>, "buffer_atomic_xor_x2", []>;
-//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <mubuf<0x5c>, "buffer_atomic_inc_x2", []>;
-//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <mubuf<0x5d>, "buffer_atomic_dec_x2", []>;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Atomic <
+  mubuf<0x5c, 0x6b>, "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+>;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Atomic <
+  mubuf<0x5d, 0x6c>, "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+>;
 //def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <mubuf<0x5e>, "buffer_atomic_fcmpswap_x2", []>; // isn't on VI
 //def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <mubuf<0x5f>, "buffer_atomic_fmin_x2", []>; // isn't on VI
 //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <mubuf<0x60>, "buffer_atomic_fmax_x2", []>; // isn't on VI
@@ -3071,6 +3080,8 @@
 def : DSAtomicRetPat<DS_WRXCHG_RTN_B32, i32, si_atomic_swap_local>;
 def : DSAtomicRetPat<DS_ADD_RTN_U32, i32, si_atomic_load_add_local>;
 def : DSAtomicRetPat<DS_SUB_RTN_U32, i32, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U32, i32, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U32, i32, si_atomic_dec_local>;
 def : DSAtomicRetPat<DS_AND_RTN_B32, i32, si_atomic_load_and_local>;
 def : DSAtomicRetPat<DS_OR_RTN_B32, i32, si_atomic_load_or_local>;
 def : DSAtomicRetPat<DS_XOR_RTN_B32, i32, si_atomic_load_xor_local>;
@@ -3078,13 +3089,14 @@
 def : DSAtomicRetPat<DS_MAX_RTN_I32, i32, si_atomic_load_max_local>;
 def : DSAtomicRetPat<DS_MIN_RTN_U32, i32, si_atomic_load_umin_local>;
 def : DSAtomicRetPat<DS_MAX_RTN_U32, i32, si_atomic_load_umax_local>;
-
 def : DSAtomicCmpXChg<DS_CMPST_RTN_B32, i32, si_atomic_cmp_swap_32_local>;
 
 // 64-bit atomics.
 def : DSAtomicRetPat<DS_WRXCHG_RTN_B64, i64, si_atomic_swap_local>;
 def : DSAtomicRetPat<DS_ADD_RTN_U64, i64, si_atomic_load_add_local>;
 def : DSAtomicRetPat<DS_SUB_RTN_U64, i64, si_atomic_load_sub_local>;
+def : DSAtomicRetPat<DS_INC_RTN_U64, i64, si_atomic_inc_local>;
+def : DSAtomicRetPat<DS_DEC_RTN_U64, i64, si_atomic_dec_local>;
 def : DSAtomicRetPat<DS_AND_RTN_B64, i64, si_atomic_load_and_local>;
 def : DSAtomicRetPat<DS_OR_RTN_B64, i64, si_atomic_load_or_local>;
 def : DSAtomicRetPat<DS_XOR_RTN_B64, i64, si_atomic_load_xor_local>;
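
Because the DS forms go through the same M0-glue machinery as the existing
local atomics, constant pointer offsets should still fold into the ds_inc /
ds_dec immediate offset field. A minimal sketch with illustrative names (the
offset:32 outcome matches the lds *_offset tests below):

  declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64) #0

  define i64 @example_dec_lds_offset(i64 addrspace(3)* %ptr) #1 {
    ; The constant GEP (4 elements x 8 bytes) should become
    ; "ds_dec_rtn_u64 ... offset:32" rather than a separate address add.
    %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
    %old = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
    ret i64 %old
  }

  attributes #0 = { nounwind argmemonly }
  attributes #1 = { nounwind }
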
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -0,0 +1,251 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
+
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32:
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 42
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
+define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
+define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_dec [[K]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
+; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
+  %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  store i32 %result, i32 addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
+; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+  %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  ret void
+}
+
+@lds0 = addrspace(3) global [512 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+  store i32 %idx.0, i32 addrspace(1)* %add_use
+  store i32 %val0, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
+define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_dec_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
+define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
+define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
+define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
+; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
+  %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  store i64 %result, i64 addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
+; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+  %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  ret void
+}
+
+@lds1 = addrspace(3) global [512 x i64] undef, align 8
+
+; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
+define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+  store i32 %idx.0, i32 addrspace(1)* %add_use
+  store i64 %val0, i64 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -0,0 +1,251 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
+
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32:
+; GCN: s_load_dword [[SPTR:s[0-9]+]],
+; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 42
+; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
+; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
+define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
+define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
+define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: buffer_atomic_inc [[K]], s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
+define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+  %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
+; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
+define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
+  %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  store i32 %result, i32 addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i32_offset_addr64:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
+; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
+define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
+  %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  ret void
+}
+
+@lds0 = addrspace(3) global [512 x i32] undef, align 4
+
+; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
+define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+  store i32 %idx.0, i32 addrspace(1)* %add_use
+  store i32 %val0, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
+define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}lds_atomic_inc_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
+define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
+define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
+define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
+define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+  %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
+; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
+define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+  %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
+  %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  store i64 %result, i64 addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
+; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
+; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
+define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
+  %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  ret void
+}
+
+@lds1 = addrspace(3) global [512 x i64] undef, align 8
+
+; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
+; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
+define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
+  %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+  store i32 %idx.0, i32 addrspace(1)* %add_use
+  store i64 %val0, i64 addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
Index: test/CodeGen/AMDGPU/local-atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/local-atomics.ll
+++ test/CodeGen/AMDGPU/local-atomics.ll
@@ -324,7 +324,6 @@
   ret void
 }
 
-; XXX - Is it really necessary to load 4 into VGPR?
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32:
 ; GCN: s_load_dword [[SPTR:s[0-9]+]],
 ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4