Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -416,6 +416,14 @@
 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
+
+def int_amdgcn_s_buffer_load : Intrinsic <
+  [llvm_anyint_ty],
+  [LLVMQualPointerType<llvm_i32_ty, 42>,
+   llvm_i32_ty, // byte offset
+   llvm_i1_ty], // glc
+  [IntrReadMem, IntrArgMemOnly, NoCapture<0>]>;
+
 class AMDGPUBufferStore : Intrinsic <
   [],
   [llvm_anyfloat_ty, // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
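For reference, a minimal IR-level use of the new intrinsic; the value names here are illustrative, but the signature matches the declarations exercised by the tests below:

  ; %rsrc is a buffer resource pointer in the new address space 42
  ; (CONSTANT_ADDRESS_W_RSRC); the i32 is a byte offset, the i1 is the glc bit.
  %val = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %rsrc, i32 16, i1 false)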
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -169,6 +169,7 @@
   CONSTANT_BUFFER_14 = 22,
   CONSTANT_BUFFER_15 = 23,
+  CONSTANT_ADDRESS_W_RSRC = 42,
 
   // Some places use this if the address space can't be determined.
   UNKNOWN_ADDRESS_SPACE = ~0u
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -340,6 +340,7 @@
   ATOMIC_DEC,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
+  SBUFFER_LOAD,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3305,6 +3305,7 @@
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(SBUFFER_LOAD)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -164,7 +164,7 @@
   // 32-bit private, local, and region pointers. 64-bit global, constant and
   // flat.
-  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p42:128:128"
          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 }
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -102,6 +102,12 @@
   }
 
   unsigned getVectorSplitCost() { return 0; }
+
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
+
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType);
+
 };

 } // end namespace llvm
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -84,6 +84,7 @@
   switch (AddrSpace) {
   case AMDGPUAS::GLOBAL_ADDRESS:
   case AMDGPUAS::CONSTANT_ADDRESS:
+  case AMDGPUAS::CONSTANT_ADDRESS_W_RSRC:
   case AMDGPUAS::FLAT_ADDRESS:
     return 128;
   case AMDGPUAS::LOCAL_ADDRESS:
@@ -341,3 +342,36 @@
 
   return false;
 }
+
+bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                       MemIntrinsicInfo &Info) {
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::amdgcn_s_buffer_load:
+    Info.ReadMem = true;
+    Info.WriteMem = false;
+    Info.IsSimple = true;
+    Info.NumMemRefs = 1;
+
+    // We can only set this if the intrinsic is functionally equivalent to a
+    // load/store.
+    if (auto *Offset = dyn_cast<ConstantInt>(Inst->getArgOperand(1))) {
+      if (Offset->isZero() &&
+          cast<ConstantInt>(Inst->getArgOperand(2))->isZero()) {
+        Info.PtrVal = Inst->getArgOperand(0);
+      }
+    }
+    break;
+  }
+  return true;
+}
+
+Value *AMDGPUTTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                        Type *ExpectedType) {
+  if (Inst->getType() == ExpectedType)
+    return Inst;
+
+  return nullptr;
+}
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -713,8 +713,8 @@
 
 // Offset in a 32-bit VGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, i32:$voff),
-  (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
+  (i32 (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc)),
+  (BUFFER_LOAD_DWORD_OFFEN $offset, $sbase, (i32 0), 0, (as_i1imm $glc), 0, 0)
 >;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -176,6 +176,8 @@
                          EVT VT) const override;
   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                             SelectionDAG &DAG) const override;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -34,6 +34,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/DAGCombine.h"
@@ -431,6 +432,15 @@
     Info.readMem = true;
     Info.writeMem = true;
     return true;
+  case Intrinsic::amdgcn_s_buffer_load:
+    Info.opc = AMDGPUISD::SBUFFER_LOAD;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
   default:
     return false;
   }
@@ -508,6 +518,7 @@
     return isLegalMUBUFAddressingMode(AM);
 
+  case AMDGPUAS::CONSTANT_ADDRESS_W_RSRC:
   case AMDGPUAS::CONSTANT_ADDRESS:
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
@@ -1939,6 +1950,37 @@
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
 
+void SITargetLowering::LowerOperationWrapper(SDNode *N,
+                                             SmallVectorImpl<SDValue> &Results,
+                                             SelectionDAG &DAG) const {
+  if (N->getOpcode() != AMDGPUISD::SBUFFER_LOAD) {
+    TargetLowering::LowerOperationWrapper(N, Results, DAG);
+    return;
+  }
+
+  SDLoc DL(N);
+  MemSDNode *M = cast<MemSDNode>(N);
+  SDValue Ops[] = {
+    M->getOperand(0),                                            // Chain
+    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, M->getOperand(1)), // Ptr
+    M->getOperand(2),                                            // Offset
+    DAG.getTargetConstant(
+        cast<ConstantSDNode>(M->getOperand(3))->getZExtValue(), DL,
+        MVT::i1)                                                 // glc
+  };
+
+  auto *MMO = M->getMemOperand();
+  if (isDereferenceablePointer(MMO->getValue(), DAG.getDataLayout()))
+    MMO->setFlags(MachineMemOperand::MODereferenceable);
+
+  SDValue LD = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                        M->getVTList(), Ops, M->getMemoryVT(),
+                                       M->getMemOperand());
+  Results.push_back(LD);
+  Results.push_back(LD.getValue(1));
+}
+
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -2643,8 +2685,10 @@
       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
   case AMDGPUIntrinsic::SI_load_const: {
     SDValue Ops[] = {
-      Op.getOperand(1),
-      Op.getOperand(2)
+      DAG.getEntryNode(),                   // Chain
+      Op.getOperand(1),                     // Ptr
+      Op.getOperand(2),                     // Offset
+      DAG.getTargetConstant(0, DL, MVT::i1) // glc
     };
 
     MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -2652,8 +2696,16 @@
         MachineMemOperand::MOLoad |
         MachineMemOperand::MODereferenceable |
         MachineMemOperand::MOInvariant,
         VT.getStoreSize(), 4);
-    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
+    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                           VTList, Ops, MVT::i32, MMO);
+
+    SDValue MergeOps[] = {
+      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load),
+      Load.getValue(1)
+    };
+
+    return DAG.getMergeValues(MergeOps, DL);
   }
   case AMDGPUIntrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -26,11 +26,6 @@
 // SI DAG Nodes
 //===----------------------------------------------------------------------===//
 
-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
-  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
-  [SDNPMayLoad, SDNPMemOperand]
->;
-
 def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
@@ -97,6 +92,11 @@
 defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
 defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
 
+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
+  SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]
+>;
+
 //===----------------------------------------------------------------------===//
 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
 // to be glued to the memory instructions.
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -276,15 +276,16 @@
 defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
 
 // 1. Offset as an immediate
-def SM_LOAD_PATTERN : Pat < // name this pattern to reuse AddedComplexity on CI
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
+// Name this pattern to reuse AddedComplexity on CI.
+def SM_LOAD_PATTERN : Pat <
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, (as_i1imm $glc))
 >;
 
 // 2. Offset loaded in a 32-bit SGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferSgpr i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, (as_i1imm $glc))
 >;
 
 } // End let AddedComplexity = 100
@@ -520,8 +521,8 @@
 def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;
 
 def : Pat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, (as_i1imm $glc))> {
   let Predicates = [isCI]; // should this be isCIOnly?
}
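Taken together with the BUFInstructions.td pattern earlier, selection of SIsbuffer_load splits by the kind of offset operand, which the smrd.ll and mubuf.ll checks below verify:

  ; immediate offset  -> S_BUFFER_LOAD_DWORD_IMM (the _IMM_ci form on CI)
  ; offset in an SGPR -> S_BUFFER_LOAD_DWORD_SGPR
  ; offset in a VGPR  -> BUFFER_LOAD_DWORD_OFFEN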
Index: test/CodeGen/AMDGPU/mubuf.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf.ll
+++ test/CodeGen/AMDGPU/mubuf.ll
@@ -84,6 +84,23 @@
   ret void
 }
 
+; Using the load.const and s.buffer.load intrinsics with a VGPR offset
+; CHECK-LABEL: {{^}}s_buffer_load:
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen
+define amdgpu_ps void @s_buffer_load(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
+main_body:
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 %tid)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 %tid, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
+  ret void
+}
+
 ;;;==========================================================================;;;
 ;;; MUBUF STORE TESTS
 ;;;==========================================================================;;;
@@ -174,7 +191,12 @@
   ret void
 }
 
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
 declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
 attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -87,13 +87,18 @@
 ; SMRD load using the load.const intrinsic with an immediate offset
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 16, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
@@ -101,13 +106,18 @@
 ; offset.
 ; GCN-LABEL: {{^}}smrd_load_const1:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 1020, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
 ; SMRD load using the load.const intrinsic with an offset greater than the
@@ -116,14 +126,20 @@
 ; GCN-LABEL: {{^}}smrd_load_const2:
 ; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 1024, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
@@ -131,14 +147,20 @@
 ; GCN-LABEL: {{^}}smrd_load_const3:
 ; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 1048572, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
@@ -146,14 +168,19 @@
 ; GCN-LABEL: {{^}}smrd_load_const4:
 ; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
 ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
 main_body:
   %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
   %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
   %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 1048576, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %s.buffer.float)
   ret void
 }
 
@@ -162,4 +189,6 @@
 
 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
+declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)
+
 attributes #0 = { nounwind readnone }
Index: test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
===================================================================
--- /dev/null
+++ test/Transforms/EarlyCSE/AMDGPU/intrinsics.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -S -mtriple=amdgcn-- -early-cse | FileCheck %s
+
+; CHECK-LABEL: @no_cse
+; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+; CHECK: call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+define void @no_cse(i32 addrspace(1)* %out, i32 addrspace(42)* %in) {
+  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+  %b = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+  %c = add i32 %a, %b
+  store i32 %c, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @cse_zero_offset
+; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+; CHECK: add i32 [[CSE]], [[CSE]]
+define void @cse_zero_offset(i32 addrspace(1)* %out, i32 addrspace(42)* %in) {
+  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+  %b = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 0, i1 false)
+  %c = add i32 %a, %b
+  store i32 %c, i32 addrspace(1)* %out
+  ret void
+}
+
+; CHECK-LABEL: @cse_nonzero_offset
+; CHECK: [[CSE:%[a-z0-9A-Z]+]] = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+; CHECK: add i32 [[CSE]], [[CSE]]
+define void @cse_nonzero_offset(i32 addrspace(1)* %out, i32 addrspace(42)* %in) {
+  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+  %b = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %in, i32 4, i1 false)
+  %c = add i32 %a, %b
+  store i32 %c, i32 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)
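Taken together, these tests pin down the EarlyCSE behavior enabled by the new TTI hooks: identical readonly calls are merged, while calls that differ in offset are kept apart. The zero-offset case is the one where getTgtMemIntrinsic additionally records Info.PtrVal, since such a call reads exactly the dword the pointer refers to; a sketch with illustrative names:

  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %p, i32 0, i1 false)
  ; ...reads the same memory as:
  %b = load i32, i32 addrspace(42)* %p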