Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -406,9 +406,20 @@
   [NoCapture<0>]
 >;
 
+class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
+  [llvm_i32_ty],
+  [llvm_anyptr_ty, // LDS or GDS ptr
+   llvm_i1_ty],    // isVolatile
+  [IntrConvergent, IntrArgMemOnly, NoCapture<0>]
+>;
+
 def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 
+// The pointer argument is assumed to be dynamically uniform if a VGPR.
+def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
+def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
+
 def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
 def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
 def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -106,12 +106,13 @@
 
   MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
 
-  SDNode *glueCopyToM0(SDNode *N) const;
+  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                        unsigned OffsetBits) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -209,6 +210,7 @@
   void SelectBRCOND(SDNode *N);
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
+  void SelectINTRINSIC_W_CHAIN(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -339,29 +341,32 @@
   }
 }
 
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
-      !Subtarget->ldsRequiresM0Init())
-    return N;
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
   const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    *static_cast<const SITargetLowering*>(getTargetLowering());
 
   // Write max value to m0 before each load operation
 
   SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
-                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+                                 Val);
 
   SDValue Glue = M0.getValue(1);
 
   SmallVector<SDValue, 8> Ops;
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    Ops.push_back(N->getOperand(i));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Ops.push_back(N->getOperand(i));
+
   Ops.push_back(Glue);
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
+  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
+      !Subtarget->ldsRequiresM0Init())
+    return N;
+  return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+}
+
 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                   EVT VT) const {
   SDNode *Lo = CurDAG->getMachineNode(
@@ -472,7 +477,7 @@
        Opc == ISD::ATOMIC_LOAD_FADD ||
        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
        Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
 
   switch (Opc) {
   default:
@@ -570,7 +575,7 @@
   case ISD::STORE:
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE: {
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
     break;
   }
@@ -648,6 +653,12 @@
       SelectCode(N);
       return;
     }
+
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SelectINTRINSIC_W_CHAIN(N);
+    return;
   }
   }
@@ -828,7 +839,7 @@
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                          unsigned OffsetBits) const {
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
       (OffsetBits == 8 && !isUInt<8>(Offset)))
@@ -1760,6 +1771,52 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if ((IntrID != Intrinsic::amdgcn_ds_append &&
+       IntrID != Intrinsic::amdgcn_ds_consume) ||
+      N->getValueType(0) != MVT::i32) {
+    SelectCode(N);
+    return;
+  }
+
+  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
+  // be copied to an SGPR with readfirstlane.
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Ptr = N->getOperand(2);
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+  SDValue Offset;
+  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
+    SDValue PtrBase = Ptr.getOperand(0);
+    SDValue PtrOffset = Ptr.getOperand(1);
+
+    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
+    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+      N = glueCopyToM0(N, PtrBase);
+      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
+    }
+  }
+
+  if (!Offset) {
+    N = glueCopyToM0(N, Ptr);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+  }
+
+  SDValue Ops[] = {
+    Offset,
+    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+    Chain,
+    N->getOperand(N->getNumOperands() - 1) // New glue
+  };
+
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                             unsigned &Mods) const {
   Mods = 0;
Index: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -109,7 +109,8 @@
   // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
   // can't represent this with range metadata because it's only allowed for
   // integer types.
-  if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+  if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+       PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
       ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
     continue;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -926,7 +926,20 @@
     return true;
   }
+  case Intrinsic::amdgcn_ds_append:
+  case Intrinsic::amdgcn_ds_consume: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(1));
+    if (!Vol || !Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+
+    return true;
+  }
   default:
     return false;
   }
@@ -1978,7 +1991,8 @@
     auto *ParamTy =
       dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-        ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+        ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+                    ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
       // On SI local pointers are just offsets into LDS, so they are always
       // less than 16-bits. On CI and newer they could potentially be
       // real pointers, so we can't guarantee their size.
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -275,6 +275,11 @@
     if (OffsetImm) {
       // Normal, single offset LDS instruction.
       BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+      // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
+      // report that here?
+      if (!BaseOp)
+        return false;
+
       Offset = OffsetImm->getImm();
       assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                                 "operands of type register.");
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
@@ -0,0 +1,125 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+
+; GCN-LABEL: {{^}}ds_append_lds:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_lds_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_no_fold_offset_si:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+
+; SI: s_add_i32 [[PTR]], [[PTR]], 16
+; SI: s_mov_b32 m0, [[PTR]]
+; SI: ds_append [[RESULT:v[0-9]+]]{{$}}
+
+; CIPLUS: s_mov_b32 m0, [[PTR]]
+; CIPLUS: ds_append [[RESULT:v[0-9]+]] offset:16{{$}}
+
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
+  %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_lds_over_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+
+; SI: s_bitset1_b32 [[PTR]], 16
+; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000
+
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr:
+; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN: s_mov_b32 m0, [[READLANE]]
+; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define void @ds_append_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_gds:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]] gds{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gds, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_gds_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_gds_over_max_offset:
+define amdgpu_kernel void @ds_append_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
+  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_lds_m0_restore:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
+; NOTGFX9: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+; GCN: _store_dword
+; GCN: ds_read_b32
+define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i1 false)
+  store i32 %val0, i32 addrspace(1)* %out
+  %val1 = load volatile i32, i32 addrspace(3)* %lds
+  ret void
+}
+
+declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1) #1
+declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly convergent nounwind }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
@@ -0,0 +1,125 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+
+; GCN-LABEL: {{^}}ds_consume_lds:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_lds_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+
+; SI: s_add_i32 [[PTR]], [[PTR]], 16
+; SI: s_mov_b32 m0, [[PTR]]
+; SI: ds_consume [[RESULT:v[0-9]+]]{{$}}
+
+; CIPLUS: s_mov_b32 m0, [[PTR]]
+; CIPLUS: ds_consume [[RESULT:v[0-9]+]] offset:16{{$}}
+
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
+  %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+
+; SI: s_bitset1_b32 [[PTR]], 16
+; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000
+
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_lds_vgpr_addr:
+; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN: s_mov_b32 m0, [[READLANE]]
+; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define void @ds_consume_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_gds:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]] gds{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gds, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_gds_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_gds_over_max_offset:
+define amdgpu_kernel void @ds_consume_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
+  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_lds_m0_restore:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
+; NOTGFX9: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+; GCN: _store_dword
+; GCN: ds_read_b32
+define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val0 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i1 false)
+  store i32 %val0, i32 addrspace(1)* %out
+  %val1 = load volatile i32, i32 addrspace(3)* %lds
+  ret void
+}
+
+declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i1) #1
+declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly convergent nounwind }
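
Usage sketch (not part of the patch; the kernel and value names below are illustrative). The first operand is the LDS or GDS counter address, which is assumed to be dynamically uniform and is materialized into m0 as shown in the tests above; the i1 operand marks the access volatile when true, and the intrinsic returns an i32 result:

  declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i1) #1

  define amdgpu_kernel void @append_example(i32 addrspace(3)* %counter, i32 addrspace(1)* %out) #0 {
    ; i1 false = non-volatile; selects to "ds_append v<n>" with %counter copied into m0.
    %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %counter, i1 false)
    store i32 %val, i32 addrspace(1)* %out
    ret void
  }

  attributes #0 = { nounwind }
  attributes #1 = { argmemonly convergent nounwind }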