Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -437,6 +437,39 @@
 def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
 def int_amdgcn_buffer_store : AMDGPUBufferStore;
 
+def int_amdgcn_tbuffer_load : Intrinsic <
+    [llvm_anyint_ty], // overloaded for types i32, v2i32, v4i32
+    [llvm_v4i32_ty,   // rsrc(SGPR)
+     llvm_i32_ty,     // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+     llvm_i32_ty,     // vaddr(VGPR)
+     llvm_i32_ty,     // soffset(SGPR)
+     llvm_i32_ty,     // inst_offset(imm)
+     llvm_i32_ty,     // dfmt(imm)
+     llvm_i32_ty,     // nfmt(imm)
+     llvm_i32_ty,     // offen(imm)
+     llvm_i32_ty,     // idxen(imm)
+     llvm_i32_ty,     // glc(imm)
+     llvm_i32_ty,     // slc(imm)
+     llvm_i32_ty],    // tfe(imm)
+    []>;
+
+def int_amdgcn_tbuffer_store : Intrinsic <
+    [],
+    [llvm_v4i32_ty,  // rsrc(SGPR)
+     llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+     llvm_i32_ty,    // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+     llvm_i32_ty,    // vaddr(VGPR)
+     llvm_i32_ty,    // soffset(SGPR)
+     llvm_i32_ty,    // inst_offset(imm)
+     llvm_i32_ty,    // dfmt(imm)
+     llvm_i32_ty,    // nfmt(imm)
+     llvm_i32_ty,    // offen(imm)
+     llvm_i32_ty,    // idxen(imm)
+     llvm_i32_ty,    // glc(imm)
+     llvm_i32_ty,    // slc(imm)
+     llvm_i32_ty],   // tfe(imm)
+    []>;
+
 class AMDGPUBufferAtomic : Intrinsic <
   [llvm_i32_ty],
   [llvm_i32_ty,       // vdata(VGPR)
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -349,6 +349,7 @@
   STORE_MSKOR,
   LOAD_CONSTANT,
   TBUFFER_STORE_FORMAT,
+  TBUFFER_LOAD_FORMAT,
   ATOMIC_CMP_SWAP,
   ATOMIC_INC,
   ATOMIC_DEC,
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3482,6 +3482,7 @@
   NODE_NAME_CASE(STORE_MSKOR)
   NODE_NAME_CASE(LOAD_CONSTANT)
   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
+  NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
   NODE_NAME_CASE(ATOMIC_INC)
   NODE_NAME_CASE(ATOMIC_DEC)
Index: lib/Target/AMDGPU/BUFInstructions.td
===================================================================
--- lib/Target/AMDGPU/BUFInstructions.td
+++ lib/Target/AMDGPU/BUFInstructions.td
@@ -674,9 +674,9 @@
 // MTBUF Instructions
 //===----------------------------------------------------------------------===//
 
-//def TBUFFER_LOAD_FORMAT_X    : MTBUF_ <0, "tbuffer_load_format_x", []>;
-//def TBUFFER_LOAD_FORMAT_XY   : MTBUF_ <1, "tbuffer_load_format_xy", []>;
-//def TBUFFER_LOAD_FORMAT_XYZ  : MTBUF_ <2, "tbuffer_load_format_xyz", []>;
+def TBUFFER_LOAD_FORMAT_X     : MTBUF_Load_Pseudo  <"tbuffer_load_format_x", VGPR_32>;
+def TBUFFER_LOAD_FORMAT_XY    : MTBUF_Load_Pseudo  <"tbuffer_load_format_xy", VReg_64>;
+def TBUFFER_LOAD_FORMAT_XYZ   : MTBUF_Load_Pseudo  <"tbuffer_load_format_xyz", VReg_128>;
 def TBUFFER_LOAD_FORMAT_XYZW  : MTBUF_Load_Pseudo  <"tbuffer_load_format_xyzw", VReg_128>;
 def TBUFFER_STORE_FORMAT_X    : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>;
 def TBUFFER_STORE_FORMAT_XY   : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>;
@@ -1078,6 +1078,23 @@
 // MTBUF Patterns
 //===----------------------------------------------------------------------===//
 
+// TBUFFER_LOAD_FORMAT_*, addr64=0
+class MTBUF_LoadResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
+  (vt (SItbuffer_load v4i32:$rsrc, num_channels, i32:$vaddr,
+                      i32:$soffset, imm:$inst_offset, imm:$dfmt,
+                      imm:$nfmt, imm:$offen, imm:$idxen,
+                      imm:$glc, imm:$slc, imm:$tfe)),
+  (opcode
+    (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+    (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
+    (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_LoadResource <i32, 1, TBUFFER_LOAD_FORMAT_X>;
+def : MTBUF_LoadResource <v2i32, 2, TBUFFER_LOAD_FORMAT_XY>;
+def : MTBUF_LoadResource <v4i32, 3, TBUFFER_LOAD_FORMAT_XYZ>;
+def : MTBUF_LoadResource <v4i32, 4, TBUFFER_LOAD_FORMAT_XYZW>;
+
 // TBUFFER_STORE_FORMAT_*, addr64=0
 class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF_Pseudo opcode> : Pat<
   (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
@@ -1218,6 +1235,9 @@
   let Inst{18-16} = op;
 }
 
+def TBUFFER_LOAD_FORMAT_X_si     : MTBUF_Real_si <0, TBUFFER_LOAD_FORMAT_X>;
+def TBUFFER_LOAD_FORMAT_XY_si    : MTBUF_Real_si <1, TBUFFER_LOAD_FORMAT_XY>;
+def TBUFFER_LOAD_FORMAT_XYZ_si   : MTBUF_Real_si <2, TBUFFER_LOAD_FORMAT_XYZ>;
 def TBUFFER_LOAD_FORMAT_XYZW_si  : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>;
 def TBUFFER_STORE_FORMAT_X_si    : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>;
 def TBUFFER_STORE_FORMAT_XY_si   : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>;
@@ -1342,6 +1362,9 @@
   let Inst{18-15} = op;
 }
 
+def TBUFFER_LOAD_FORMAT_X_vi     : MTBUF_Real_vi <0, TBUFFER_LOAD_FORMAT_X>;
+def TBUFFER_LOAD_FORMAT_XY_vi    : MTBUF_Real_vi <1, TBUFFER_LOAD_FORMAT_XY>;
+def TBUFFER_LOAD_FORMAT_XYZ_vi   : MTBUF_Real_vi <2, TBUFFER_LOAD_FORMAT_XYZ>;
 def TBUFFER_LOAD_FORMAT_XYZW_vi  : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>;
 def TBUFFER_STORE_FORMAT_X_vi    : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>;
 def TBUFFER_STORE_FORMAT_XY_vi   : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2938,6 +2938,8 @@
                                                  SelectionDAG &DAG) const {
   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   SDLoc DL(Op);
+  MachineFunction &MF = DAG.getMachineFunction();
+
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec: {
@@ -2963,7 +2965,6 @@
       Op.getOperand(5), // glc
       Op.getOperand(6)  // slc
     };
-    MachineFunction &MF = DAG.getMachineFunction();
     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
@@ -2978,6 +2979,32 @@
 
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
   }
+  case Intrinsic::amdgcn_tbuffer_load: {
+    SDValue Ops[] = {
+      Op.getOperand(0),  // Chain
+      Op.getOperand(2),  // rsrc
+      Op.getOperand(3),  // num_channels
+      Op.getOperand(4),  // vaddr
+      Op.getOperand(5),  // soffset
+      Op.getOperand(6),  // inst_offset
+      Op.getOperand(7),  // dfmt
+      Op.getOperand(8),  // nfmt
+      Op.getOperand(9),  // offen
+      Op.getOperand(10), // idxen
+      Op.getOperand(11), // glc
+      Op.getOperand(12), // slc
+      Op.getOperand(13)  // tfe
+    };
+
+    // Memory VT is the loaded data (i32, v2i32 or v4i32), not the v4i32 rsrc.
+    EVT VT = Op.getValueType();
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+      MachinePointerInfo(),
+      MachineMemOperand::MOLoad,
+      VT.getStoreSize(), 4);
+    return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
+                                   Op->getVTList(), Ops, VT, MMO);
+  }
   default:
     return SDValue();
   }
@@ -3047,7 +3074,8 @@
     return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
                        Op.getOperand(2), Glue);
   }
-  case AMDGPUIntrinsic::SI_tbuffer_store: {
+  case AMDGPUIntrinsic::SI_tbuffer_store: // Retained for backward compatibility
+  case Intrinsic::amdgcn_tbuffer_store: {
     SDValue Ops[] = {
       Chain,
       Op.getOperand(2),
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -39,6 +39,25 @@
   [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
 >;
 
+def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT",
+  SDTypeProfile<1, 12,
+    [SDTCisVT<0, iAny>,   // vdata(VGPR)
+     SDTCisVT<1, v4i32>,  // rsrc(SGPR)
+     SDTCisVT<2, i32>,    // num_channels(imm)
+     SDTCisVT<3, i32>,    // vaddr(VGPR)
+     SDTCisVT<4, i32>,    // soffset(SGPR)
+     SDTCisVT<5, i32>,    // inst_offset(imm)
+     SDTCisVT<6, i32>,    // dfmt(imm)
+     SDTCisVT<7, i32>,    // nfmt(imm)
+     SDTCisVT<8, i32>,    // offen(imm)
+     SDTCisVT<9, i32>,    // idxen(imm)
+     SDTCisVT<10, i32>,   // glc(imm)
+     SDTCisVT<11, i32>,   // slc(imm)
+     SDTCisVT<12, i32>    // tfe(imm)
+    ]>,
+  [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
+>;
+
 def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
   SDTypeProfile<0, 13,
     [SDTCisVT<0, v4i32>,   // rsrc(SGPR)
Index: lib/Target/AMDGPU/SIIntrinsics.td
===================================================================
--- lib/Target/AMDGPU/SIIntrinsics.td
+++ lib/Target/AMDGPU/SIIntrinsics.td
@@ -32,6 +32,23 @@
   def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
 
+  // Fully-flexible TBUFFER_LOAD_FORMAT_* except for the ADDR64 bit, which is not exposed
+  def int_SI_tbuffer_load : Intrinsic <
+    [llvm_anyint_ty], // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+    [llvm_anyint_ty,  // rsrc(SGPR)
+     llvm_i32_ty,     // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+     llvm_i32_ty,     // vaddr(VGPR)
+     llvm_i32_ty,     // soffset(SGPR)
+     llvm_i32_ty,     // inst_offset(imm)
+     llvm_i32_ty,     // dfmt(imm)
+     llvm_i32_ty,     // nfmt(imm)
+     llvm_i32_ty,     // offen(imm)
+     llvm_i32_ty,     // idxen(imm)
+     llvm_i32_ty,     // glc(imm)
+     llvm_i32_ty,     // slc(imm)
+     llvm_i32_ty],    // tfe(imm)
+    []>;
+
   // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
   def int_SI_tbuffer_store : Intrinsic <
     [],
Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll
@@ -0,0 +1,43 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}test1:
+;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs <4 x i32> @test1(i32 %vaddr) {
+    %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> undef,
+        i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret <4 x i32> %vdata
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: tbuffer_load_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs <4 x i32> @test2(i32 %vaddr) {
+    %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> undef,
+        i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret <4 x i32> %vdata
+}
+
+;CHECK-LABEL: {{^}}test3:
+;CHECK: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs <2 x i32> @test3(i32 %vaddr) {
+    %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> undef,
+        i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret <2 x i32> %vdata
+}
+
+;CHECK-LABEL: {{^}}test4:
+;CHECK: tbuffer_load_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs float @test4(i32 %vaddr) {
+    %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> undef,
+        i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    %vdata.f = bitcast i32 %vdata to float
+    ret float %vdata.f
+}
+
+declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll
@@ -0,0 +1,45 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}test1:
+;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) {
+    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+    call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
+        i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+;CHECK-LABEL: {{^}}test2:
+;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) {
+    %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0
+    call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> undef, <4 x i32> %vdata,
+        i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+;CHECK-LABEL: {{^}}test3:
+;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) {
+    %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0
+    call void @llvm.amdgcn.tbuffer.store.v2i32(<4 x i32> undef, <2 x i32> %vdata,
+        i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+;CHECK-LABEL: {{^}}test4:
+;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0
+define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) {
+    call void @llvm.amdgcn.tbuffer.store.i32(<4 x i32> undef, i32 %vdata,
+        i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1,
+        i32 1, i32 0)
+    ret void
+}
+
+declare void @llvm.amdgcn.tbuffer.store.i32(<4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.tbuffer.store.v2i32(<4 x i32>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)