Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -463,6 +463,14 @@
 def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
 def int_amdgcn_buffer_load : AMDGPUBufferLoad;
+
+def int_amdgcn_s_buffer_load : Intrinsic <
+  [llvm_anyint_ty],
+  [LLVMQualPointerType<LLVMMatchType<0>, 42>, // rsrc pointer (CONSTANT_ADDRESS_W_RSRC)
+   llvm_i32_ty,                               // byte offset
+   llvm_i1_ty],                               // glc
+  [IntrReadMem, IntrArgMemOnly, NoCapture<0>]>;
+
 class AMDGPUBufferStore : Intrinsic <
   [],
   [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -193,6 +193,7 @@
   const static unsigned CONSTANT_BUFFER_14 = 22;
   const static unsigned CONSTANT_BUFFER_15 = 23;
 
+  const static unsigned CONSTANT_ADDRESS_W_RSRC = 42;
   // Some places use this if the address space can't be determined.
   const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u;
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -389,6 +389,7 @@
   ATOMIC_DEC,
   BUFFER_LOAD,
   BUFFER_LOAD_FORMAT,
+  SBUFFER_LOAD,
   LAST_AMDGPU_ISD_NUMBER
 };
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3532,6 +3532,7 @@
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
+  NODE_NAME_CASE(SBUFFER_LOAD)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
   return nullptr;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -222,7 +222,7 @@
     return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
            "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
-  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p42:128:128"
          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 }
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -114,6 +114,12 @@
   }
 
   unsigned getVectorSplitCost() { return 0; }
+
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
+
+  Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                           Type *ExpectedType);
+
 };

 } // end namespace llvm
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -203,6 +203,7 @@
   AMDGPUAS AS = ST->getAMDGPUAS();
   if (AddrSpace == AS.GLOBAL_ADDRESS ||
       AddrSpace == AS.CONSTANT_ADDRESS ||
+      AddrSpace == AS.CONSTANT_ADDRESS_W_RSRC ||
       AddrSpace == AS.FLAT_ADDRESS)
     return 128;
   if (AddrSpace == AS.LOCAL_ADDRESS ||
@@ -479,3 +480,34 @@

   return false;
 }
+
+bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                       MemIntrinsicInfo &Info) {
+  IRBuilder<> Builder(Inst);
+  switch (Inst->getIntrinsicID()) {
+  default:
+    return false;
+  case Intrinsic::amdgcn_s_buffer_load:
+    Info.ReadMem = true;
+    Info.WriteMem = false;
+
+    // We can only set this if the intrinsic is functionally equivalent to a
+    // load/store.
+    if (auto Offset = dyn_cast<ConstantInt>(Inst->getArgOperand(1))) {
+      if (Offset->isZero() &&
+          cast<ConstantInt>(Inst->getArgOperand(2))->isZero()) {
+        Info.PtrVal = Inst->getArgOperand(0);
+      }
+    }
+    break;
+  }
+  return true;
+}
+
+Value *AMDGPUTTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
+                                                        Type *ExpectedType) {
+  if (Inst->getType() == ExpectedType)
+    return Inst;
+
+  return nullptr;
+}
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -709,8 +709,8 @@

 // Offset in an 32-bit VGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, i32:$voff),
-  (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
+  (i32 (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc)),
+  (BUFFER_LOAD_DWORD_OFFEN $offset, $sbase, (i32 0), 0, (as_i1imm $glc), 0, 0)
 >;
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -195,6 +195,8 @@
                          EVT VT) const override;
   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                             SelectionDAG &DAG) const override;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -35,6 +35,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/DAGCombine.h"
@@ -544,6 +545,15 @@
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::amdgcn_s_buffer_load:
+    Info.opc = AMDGPUISD::SBUFFER_LOAD;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.vol = false;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
   default:
     return false;
   }
@@ -628,7 +638,8 @@
     }

     return isLegalMUBUFAddressingMode(AM);
-  } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+  } else if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+             AS == AMDGPUASI.CONSTANT_ADDRESS_W_RSRC) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
@@ -2152,6 +2163,36 @@
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//

+void SITargetLowering::LowerOperationWrapper(SDNode *N,
+                                             SmallVectorImpl<SDValue> &Results,
+                                             SelectionDAG &DAG) const {
+
+  if (N->getOpcode() != AMDGPUISD::SBUFFER_LOAD) {
+    TargetLowering::LowerOperationWrapper(N, Results, DAG);
+    return;
+  }
+
+  SDLoc DL(N);
+  MemSDNode *M = cast<MemSDNode>(N);
+  SDValue Ops[] = {
+    M->getOperand(0), // Chain
+    DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, M->getOperand(1)), // Ptr
+    M->getOperand(2), // Offset
+    DAG.getTargetConstant(cast<ConstantSDNode>(
+      M->getOperand(3))->getZExtValue(), DL, MVT::i1) // glc
+  };
+
+  auto MMO = M->getMemOperand();
+  if (isDereferenceablePointer(MMO->getValue(), DAG.getDataLayout()))
+    MMO->setFlags(MachineMemOperand::MODereferenceable);
+
+  SDValue LD = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                       M->getVTList(), Ops, M->getMemoryVT(),
+                                       M->getMemOperand());
+  Results.push_back(LD);
+  Results.push_back(LD.getValue(1));
+}
+
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -2970,8 +3011,10 @@
            TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
   case AMDGPUIntrinsic::SI_load_const: {
     SDValue Ops[] = {
-      Op.getOperand(1),
-      Op.getOperand(2)
+      DAG.getEntryNode(),                   // Chain
+      Op.getOperand(1),                     // Ptr
+      Op.getOperand(2),                     // Offset
+      DAG.getTargetConstant(0, DL, MVT::i1) // glc
     };

     MachineMemOperand *MMO = MF.getMachineMemOperand(
@@ -2979,8 +3022,16 @@
       MachineMemOperand::MOLoad |
       MachineMemOperand::MODereferenceable |
       MachineMemOperand::MOInvariant,
       VT.getStoreSize(), 4);
-    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
-                                   Op->getVTList(), Ops, VT, MMO);
+    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
+    SDValue Load = DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
+                                           VTList, Ops, MVT::i32, MMO);
+
+    SDValue MergeOps[] = {
+      DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load),
+      Load.getValue(1)
+    };
+
+    return DAG.getMergeValues(MergeOps, DL);
   }
   case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -26,11 +26,6 @@
 // SI DAG Nodes
 //===----------------------------------------------------------------------===//

-def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
-  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
-  [SDNPMayLoad, SDNPMemOperand]
->;
-
 def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
@@ -92,6 +87,11 @@
 defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
 defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;

+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
+  SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>]>,
+  [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]
+>;
+
 //===----------------------------------------------------------------------===//
 // SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
 // to be glued to the memory instructions.
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -276,15 +276,16 @@
 defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;

 // 1. Offset as an immediate
-def SM_LOAD_PATTERN : Pat <  // name this pattern to reuse AddedComplexity on CI
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, 0)
+// name this pattern to reuse AddedComplexity on CI
+def SM_LOAD_PATTERN : Pat <
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, $offset, (as_i1imm $glc))
 >;

 // 2. Offset loaded in an 32bit SGPR
 def : Pat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferSgpr i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, 0)
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferSgpr i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, $offset, (as_i1imm $glc))
 >;

 } // End let AddedComplexity = 100
@@ -520,8 +521,8 @@
 def : SMRD_Pattern_ci <"S_LOAD_DWORDX16", v16i32>;

 def : Pat <
-  (SIload_constant v4i32:$sbase, (SMRDBufferImm32 i32:$offset)),
-  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, 0)> {
+  (i32 (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc)),
+  (S_BUFFER_LOAD_DWORD_IMM_ci $sbase, $offset, (as_i1imm $glc))> {
   let Predicates = [isCI]; // should this be isCIOnly?
 }
Index: test/CodeGen/AMDGPU/mubuf.ll
===================================================================
--- test/CodeGen/AMDGPU/mubuf.ll
+++ test/CodeGen/AMDGPU/mubuf.ll
@@ -84,6 +84,23 @@
   ret void
 }

+; Use llvm.SI.load.const and llvm.amdgcn.s.buffer.load with a VGPR offset
+; CHECK-LABEL: {{^}}s_buffer_load:
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen
+; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen
+define amdgpu_ps void @s_buffer_load(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float, i32 addrspace(42)* addrspace(2)* inreg %in) {
+main_body:
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
+  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
+  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 %tid)
+  %23 = load i32 addrspace(42)*, i32 addrspace(42)* addrspace(2)* %in
+  %s.buffer = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %23, i32 %tid, i1 false)
+  %s.buffer.float = bitcast i32 %s.buffer to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %22, float %22, float %22, float %s.buffer.float, i1 1, i1 0)
+  ret void
+}
+
 ;;;==========================================================================;;;
 ;;; MUBUF STORE TESTS
 ;;;==========================================================================;;;
@@ -174,7 +191,12 @@
   ret void
 }

+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0
 declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1)

 attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind readnone }
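
For reference, a minimal usage sketch (not part of the patch; the function name and offsets are illustrative) of why the getTgtMemIntrinsic / getOrCreateResultFromMemIntrinsic hooks matter: two llvm.amdgcn.s.buffer.load calls on the same descriptor with a zero offset and glc = false describe the same memory, so EarlyCSE can reuse the first result for the second.

; Sketch only -- assumes the intrinsic and address space 42 introduced by this patch.
define amdgpu_ps float @sbuffer_cse_example(i32 addrspace(42)* inreg %rsrc) {
  %a = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %rsrc, i32 0, i1 false)
  %b = call i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* %rsrc, i32 0, i1 false)
  ; With the TTI hooks above (zero offset, glc = false), %b is recognized as a
  ; reload of %a and can be eliminated by EarlyCSE.
  %sum = add i32 %a, %b
  %res = bitcast i32 %sum to float
  ret float %res
}

declare i32 @llvm.amdgcn.s.buffer.load.i32(i32 addrspace(42)* nocapture, i32, i1)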