Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -406,9 +406,22 @@
   [NoCapture<0>]
 >;
 
+class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
+  [llvm_i32_ty],
+  [llvm_anyptr_ty, // LDS or GDS ptr
+   llvm_i32_ty, // ordering
+   llvm_i32_ty, // scope
+   llvm_i1_ty], // isVolatile
+  [IntrConvergent, IntrArgMemOnly, NoCapture<0>]
+>;
+
 def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
 def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;
 
+// The pointer argument is assumed to be dynamically uniform if it is a VGPR.
+def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
+def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;
+
 def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">;
 def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">;
 def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">;
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -106,12 +106,13 @@
 
   MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
 
-  SDNode *glueCopyToM0(SDNode *N) const;
+  SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+  bool isDSOffsetLegal(SDValue Base, unsigned Offset,
                        unsigned OffsetBits) const;
   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -209,6 +210,7 @@
   void SelectBRCOND(SDNode *N);
   void SelectFMAD_FMA(SDNode *N);
   void SelectATOMIC_CMP_SWAP(SDNode *N);
+  void SelectINTRINSIC_W_CHAIN(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
@@ -339,29 +341,32 @@
   }
 }
 
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
-  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
-      !Subtarget->ldsRequiresM0Init())
-    return N;
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
   const SITargetLowering& Lowering =
-      *static_cast<const SITargetLowering*>(getTargetLowering());
+    *static_cast<const SITargetLowering*>(getTargetLowering());
 
   // Write max value to m0 before each load operation
 
   SDValue M0 = Lowering.copyToM0(*CurDAG, CurDAG->getEntryNode(), SDLoc(N),
-                                 CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+                                 Val);
 
   SDValue Glue = M0.getValue(1);
 
   SmallVector <SDValue, 8> Ops;
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-     Ops.push_back(N->getOperand(i));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Ops.push_back(N->getOperand(i));
+
   Ops.push_back(Glue);
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
+  if (cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS ||
+      !Subtarget->ldsRequiresM0Init())
+    return N;
+  return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
+}
+
 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
                                                   EVT VT) const {
   SDNode *Lo = CurDAG->getMachineNode(
@@ -472,7 +477,7 @@
        Opc == ISD::ATOMIC_LOAD_FADD ||
        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
        Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
 
   switch (Opc) {
   default:
@@ -570,7 +575,7 @@
   case ISD::STORE:
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE: {
-    N = glueCopyToM0(N);
+    N = glueCopyToM0LDSInit(N);
     break;
   }
 
@@ -648,6 +653,12 @@
       SelectCode(N);
       return;
     }
+
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+    SelectINTRINSIC_W_CHAIN(N);
+    return;
   }
   }
 
@@ -828,7 +839,7 @@
   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
-bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
+bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset,
                                          unsigned OffsetBits) const {
   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
       (OffsetBits == 8 && !isUInt<8>(Offset)))
@@ -1760,6 +1771,52 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
+  unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  if ((IntrID != Intrinsic::amdgcn_ds_append &&
+       IntrID != Intrinsic::amdgcn_ds_consume) ||
+      N->getValueType(0) != MVT::i32) {
+    SelectCode(N);
+    return;
+  }
+
+  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
+  // be copied to an SGPR with readfirstlane.
+  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
+    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Ptr = N->getOperand(2);
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+  SDValue Offset;
+  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
+    SDValue PtrBase = Ptr.getOperand(0);
+    SDValue PtrOffset = Ptr.getOperand(1);
+
+    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
+    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue(), 16)) {
+      N = glueCopyToM0(N, PtrBase);
+      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
+    }
+  }
+
+  if (!Offset) {
+    N = glueCopyToM0(N, Ptr);
+    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+  }
+
+  SDValue Ops[] = {
+    Offset,
+    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
+    Chain,
+    N->getOperand(N->getNumOperands() - 1) // New glue
+  };
+
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
                                             unsigned &Mods) const {
   Mods = 0;
Index: lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -109,7 +109,8 @@
       // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
       // can't represent this with range metadata because it's only allowed for
       // integer types.
-      if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
           ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
         continue;
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -926,7 +926,20 @@
 
     return true;
   }
+  case Intrinsic::amdgcn_ds_append:
+  case Intrinsic::amdgcn_ds_consume: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 
+    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(3));
+    if (!Vol || !Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+
+    return true;
+  }
   default:
     return false;
   }
@@ -1978,7 +1991,8 @@
       auto *ParamTy =
         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
         // On SI local pointers are just offsets into LDS, so they are always
         // less than 16-bits.  On CI and newer they could potentially be
         // real pointers, so we can't guarantee their size.
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -275,6 +275,11 @@
     if (OffsetImm) {
       // Normal, single offset LDS instruction.
       BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+      // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
+      // report that here?
+      if (!BaseOp)
+        return false;
+
       Offset = OffsetImm->getImm();
       assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
                                 "operands of type register.");
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.append.ll
@@ -0,0 +1,125 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+
+; GCN-LABEL: {{^}}ds_append_lds:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_lds_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_no_fold_offset_si:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+
+; SI: s_add_i32 [[PTR]], [[PTR]], 16
+; SI: s_mov_b32 m0, [[PTR]]
+; SI: ds_append [[RESULT:v[0-9]+]]{{$}}
+
+; CIPLUS: s_mov_b32 m0, [[PTR]]
+; CIPLUS: ds_append [[RESULT:v[0-9]+]] offset:16{{$}}
+
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
+  %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_lds_over_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+
+; SI: s_bitset1_b32 [[PTR]], 16
+; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000
+
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_lds_vgpr_addr:
+; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN: s_mov_b32 m0, [[READLANE]]
+; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define void @ds_append_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_gds:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]] gds{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gds, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_gds_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_append_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_gds_over_max_offset:
+define amdgpu_kernel void @ds_append_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
+  %val = call i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_append_lds_m0_restore:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_append [[RESULT:v[0-9]+]]{{$}}
+; NOTGFX9: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+; GCN: _store_dword
+; GCN: ds_read_b32
+define amdgpu_kernel void @ds_append_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val0 = call i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
+  store i32 %val0, i32 addrspace(1)* %out
+  %val1 = load volatile i32, i32 addrspace(3)* %lds
+  ret void
+}
+
+declare i32 @llvm.amdgcn.ds.append.p3i32(i32 addrspace(3)* nocapture, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.ds.append.p2i32(i32 addrspace(2)* nocapture, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly convergent nounwind }
Index: test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.ds.consume.ll
@@ -0,0 +1,125 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,NOTGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,CIPLUS,GFX9 %s
+
+; GCN-LABEL: {{^}}ds_consume_lds:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_lds(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_lds_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_lds_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_no_fold_offset_si:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+
+; SI: s_add_i32 [[PTR]], [[PTR]], 16
+; SI: s_mov_b32 m0, [[PTR]]
+; SI: ds_consume [[RESULT:v[0-9]+]]{{$}}
+
+; CIPLUS: s_mov_b32 m0, [[PTR]]
+; CIPLUS: ds_consume [[RESULT:v[0-9]+]] offset:16{{$}}
+
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_no_fold_offset_si(i32 addrspace(3)* addrspace(4)* %lds.ptr, i32 addrspace(1)* %out) #0 {
+  %lds = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* %lds.ptr, align 4
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 4
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_lds_over_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+
+; SI: s_bitset1_b32 [[PTR]], 16
+; CIPLUS: s_add_i32 [[PTR]], [[PTR]], 0x10000
+
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_lds_over_max_offset(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(3)* %lds, i32 16384
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_lds_vgpr_addr:
+; GCN: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN: s_mov_b32 m0, [[READLANE]]
+; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define void @ds_consume_lds_vgpr_addr(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_gds:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]] gds{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_gds(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gds, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_gds_max_offset:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]] offset:65532 gds{{$}}
+; GCN: {{.*}}store{{.*}} [[RESULT]]
+define amdgpu_kernel void @ds_consume_gds_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16383
+  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_gds_over_max_offset:
+define amdgpu_kernel void @ds_consume_gds_over_max_offset(i32 addrspace(2)* %gds, i32 addrspace(1)* %out) #0 {
+  %gep = getelementptr inbounds i32, i32 addrspace(2)* %gds, i32 16384
+  %val = call i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* %gep, i32 0, i32 0, i1 false)
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ds_consume_lds_m0_restore:
+; GCN: s_load_dword [[PTR:s[0-9]+]]
+; GCN: s_mov_b32 m0, [[PTR]]
+; GCN: ds_consume [[RESULT:v[0-9]+]]{{$}}
+; NOTGFX9: s_mov_b32 m0, -1
+; GFX9-NOT: m0
+; GCN: _store_dword
+; GCN: ds_read_b32
+define amdgpu_kernel void @ds_consume_lds_m0_restore(i32 addrspace(3)* %lds, i32 addrspace(1)* %out) #0 {
+  %val0 = call i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* %lds, i32 0, i32 0, i1 false)
+  store i32 %val0, i32 addrspace(1)* %out
+  %val1 = load volatile i32, i32 addrspace(3)* %lds
+  ret void
+}
+
+declare i32 @llvm.amdgcn.ds.consume.p3i32(i32 addrspace(3)* nocapture, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.ds.consume.p2i32(i32 addrspace(2)* nocapture, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly convergent nounwind }