Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -475,6 +475,29 @@ def int_amdgcn_buffer_store_format : AMDGPUBufferStore; def int_amdgcn_buffer_store : AMDGPUBufferStore; +def int_amdgcn_tbuffer_load : Intrinsic < + [llvm_anyint_ty], // overloaded for types i32, v2i32, v4i32 + [llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vaddr(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + []>; + +def int_amdgcn_tbuffer_store : Intrinsic < + [], + [llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32 + llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // vaddr(VGPR) + llvm_i32_ty, // offset(SGPR/VGPR/imm) + llvm_i32_ty, // dfmt(imm) + llvm_i32_ty, // nfmt(imm) + llvm_i1_ty, // glc(imm) + llvm_i1_ty], // slc(imm) + []>; + class AMDGPUBufferAtomic : Intrinsic < [llvm_i32_ty], [llvm_i32_ty, // vdata(VGPR) Index: lib/Target/AMDGPU/AMDGPUISelLowering.h =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.h +++ lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -384,6 +384,7 @@ STORE_MSKOR, LOAD_CONSTANT, TBUFFER_STORE_FORMAT, + TBUFFER_LOAD_FORMAT, ATOMIC_CMP_SWAP, ATOMIC_INC, ATOMIC_DEC, Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -3527,6 +3527,7 @@ NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(LOAD_CONSTANT) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) + NODE_NAME_CASE(TBUFFER_LOAD_FORMAT) NODE_NAME_CASE(ATOMIC_CMP_SWAP) NODE_NAME_CASE(ATOMIC_INC) NODE_NAME_CASE(ATOMIC_DEC) Index: lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp =================================================================== --- lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -152,6 +152,8 @@ ImmTyExpTgt, ImmTyExpCompr, ImmTyExpVM, + ImmTyDFMT, + ImmTyNFMT, ImmTyHwreg, ImmTyOff, ImmTySendMsg, @@ -288,6 +290,8 @@ bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } bool isTFE() const { return isImmTy(ImmTyTFE); } + bool isDFMT() const { return isImmTy(ImmTyDFMT) && isUInt<8>(getImm()); } + bool isNFMT() const { return isImmTy(ImmTyNFMT) && isUInt<8>(getImm()); } bool isBankMask() const { return isImmTy(ImmTyDppBankMask); } bool isRowMask() const { return isImmTy(ImmTyDppRowMask); } bool isBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); } @@ -631,6 +635,8 @@ case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; case ImmTyTFE: OS << "TFE"; break; + case ImmTyDFMT: OS << "DFMT"; break; + case ImmTyNFMT: OS << "NFMT"; break; case ImmTyClampSI: OS << "ClampSI"; break; case ImmTyOModSI: OS << "OModSI"; break; case ImmTyDppCtrl: OS << "DppCtrl"; break; @@ -1003,6 +1009,8 @@ void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false, false); } void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, false); } void cvtMubufAtomicReturn(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true, true); } + void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); + AMDGPUOperand::Ptr defaultGLC() const; AMDGPUOperand::Ptr defaultSLC() const; AMDGPUOperand::Ptr 
defaultTFE() const; @@ -3453,6 +3461,44 @@ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); } +void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, + const OperandVector &Operands) { + OptionalImmIndexMap OptionalIdx; + + for (unsigned i = 1, e = Operands.size(); i != e; ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]); + + // Add the register arguments + if (Op.isReg()) { + Op.addRegOperands(Inst, 1); + continue; + } + + // Handle the case where soffset is an immediate + if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) { + Op.addImmOperands(Inst, 1); + continue; + } + + // Handle tokens like 'offen' which are sometimes hard-coded into the + // asm string. There are no MCInst operands for these. + if (Op.isToken()) { + continue; + } + assert(Op.isImm()); + + // Handle optional arguments + OptionalIdx[Op.getImmTy()] = i; + } + + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyNFMT); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); +} + //===----------------------------------------------------------------------===// // mimg //===----------------------------------------------------------------------===// @@ -3625,6 +3671,8 @@ {"offset1", AMDGPUOperand::ImmTyOffset1, false, nullptr}, {"gds", AMDGPUOperand::ImmTyGDS, true, nullptr}, {"offset", AMDGPUOperand::ImmTyOffset, false, nullptr}, + {"dfmt", AMDGPUOperand::ImmTyDFMT, false, nullptr}, + {"nfmt", AMDGPUOperand::ImmTyNFMT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, Index: lib/Target/AMDGPU/BUFInstructions.td =================================================================== --- lib/Target/AMDGPU/BUFInstructions.td +++ lib/Target/AMDGPU/BUFInstructions.td @@ -57,11 +57,16 @@ string OpName = NAME # suffix; } +class MTBUFAddr64Table { + bit IsAddr64 = is_addr64; + string OpName = NAME # suffix; +} + //===----------------------------------------------------------------------===// // MTBUF classes //===----------------------------------------------------------------------===// -class MTBUF_Pseudo pattern=[]> : InstSI, SIMCInstr { @@ -82,11 +87,11 @@ let hasSideEffects = 0; let SchedRW = [WriteVMEM]; } - -class MTBUF_Real : + +class MTBUF_Legacy_Real : InstSI , Enc64 { - + let isPseudo = 0; let isCodeGenOnly = 0; @@ -96,7 +101,7 @@ let Constraints = ps.Constraints; let DisableEncoding = ps.DisableEncoding; let TSFlags = ps.TSFlags; - + bits<8> vdata; bits<12> offset; bits<1> offen; @@ -126,18 +131,171 @@ let Inst{63-56} = soffset; } -class MTBUF_Load_Pseudo : MTBUF_Pseudo < - opName, (outs regClass:$dst), - (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, - i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, SReg_128:$srsrc, - i1imm:$slc, i1imm:$tfe, SCSrc_b32:$soffset), - " $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"# - " $nfmt, $vaddr, $srsrc, $slc, $tfe, $soffset"> { +class MTBUF_Pseudo pattern=[]> : + InstSI, + SIMCInstr { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let Size = 8; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = asmOps; + + let 
VM_CNT = 1; + let EXP_CNT = 1; + let MTBUF = 1; + let Uses = [EXEC]; + //let hasSideEffects = 0; + let SchedRW = [WriteVMEM]; + + let AsmMatchConverter = "cvtMtbuf"; + + bits<1> offen = 0; + bits<1> idxen = 0; + bits<1> addr64 = 0; + bits<1> has_vdata = 1; + bits<1> has_vaddr = 1; + bits<1> has_glc = 1; + bits<1> glc_value = 0; // the value for glc if no such operand + bits<4> dfmt_value = 1; // the value for dfmt if no such operand + bits<3> nfmt_value = 0; // the value for nfmt if no such operand + bits<1> has_srsrc = 1; + bits<1> has_soffset = 1; + bits<1> has_offset = 1; + bits<1> has_slc = 1; + bits<1> has_tfe = 1; + bits<1> has_dfmt = 1; + bits<1> has_nfmt = 1; +} + +class MTBUF_Real : + InstSI { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + // copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; + + bits<12> offset; + bits<1> glc; + bits<4> dfmt; + bits<3> nfmt; + bits<8> vaddr; + bits<8> vdata; + bits<7> srsrc; + bits<1> slc; + bits<1> tfe; + bits<8> soffset; +} + +class getMTBUFInsDA vdataList, + list vaddrList=[]> { + RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList)); + RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); + dag InsNoData = !if(!empty(vaddrList), + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, + offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag InsData = !if(!empty(vaddrList), + (ins vdataClass:$vdata, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe), + (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, + SCSrc_b32:$soffset, offset:$offset, DFMT:$dfmt, NFMT:$nfmt, GLC:$glc, slc:$slc, tfe:$tfe) + ); + dag ret = !if(!empty(vdataList), InsNoData, InsData); +} + +class getMTBUFIns vdataList=[]> { + dag ret = + !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA.ret, + !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA.ret, + (ins)))))); +} + +class getMTBUFAsmOps { + string Pfx = + !if(!eq(addrKind, BUFAddrKind.Offset), "off, $srsrc, $dfmt, $nfmt, $soffset", + !if(!eq(addrKind, BUFAddrKind.OffEn), "$vaddr, $srsrc, $dfmt, $nfmt, $soffset offen", + !if(!eq(addrKind, BUFAddrKind.IdxEn), "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen", + !if(!eq(addrKind, BUFAddrKind.BothEn), "$vaddr, $srsrc, $dfmt, $nfmt, $soffset idxen offen", + !if(!eq(addrKind, BUFAddrKind.Addr64), "$vaddr, $srsrc, $dfmt, $nfmt, $soffset addr64", + ""))))); + string ret = Pfx # "$offset"; +} + +class MTBUF_SetupAddr { + bits<1> offen = !if(!eq(addrKind, BUFAddrKind.OffEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> idxen = !if(!eq(addrKind, BUFAddrKind.IdxEn), 1, + !if(!eq(addrKind, BUFAddrKind.BothEn), 1 , 0)); + + bits<1> addr64 = !if(!eq(addrKind, BUFAddrKind.Addr64), 1, 0); + + bits<1> has_vaddr = !if(!eq(addrKind, BUFAddrKind.Offset), 0, 1); +} + +class MTBUF_Load_Pseudo pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind> + : MTBUF_Pseudo.ret, + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr { + 
let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 1; let mayStore = 0; } -class MTBUF_Store_Pseudo : MTBUF_Pseudo < +multiclass MTBUF_Pseudo_Loads { + + // We use some MUBUF patterns here as they are the same for MTBUF + // MUBUFOffset and MUBUFAddr64 + + def _OFFSET : MTBUF_Load_Pseudo , + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Load_Pseudo , + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Load_Pseudo ; + def _IDXEN : MTBUF_Load_Pseudo ; + def _BOTHEN : MTBUF_Load_Pseudo ; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Load_Pseudo ; + def _OFFEN_exact : MTBUF_Load_Pseudo ; + def _IDXEN_exact : MTBUF_Load_Pseudo ; + def _BOTHEN_exact : MTBUF_Load_Pseudo ; + } +} + +// Legacy tbuffer store support +class MTBUF_Store_Legacy_Pseudo : MTBUF_Legacy_Pseudo < opName, (outs), (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VGPR_32:$vaddr, @@ -148,6 +306,54 @@ let mayStore = 1; } +class MTBUF_Store_Pseudo pattern=[], + // Workaround bug bz30254 + int addrKindCopy = addrKind, + RegisterClass vdataClassCopy = vdataClass> + : MTBUF_Pseudo.ret, + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe", + pattern>, + MTBUF_SetupAddr { + let PseudoInstr = opName # "_" # getAddrName.ret; + //let mayLoad = 0; + //let mayStore = 1; +} + +multiclass MTBUF_Pseudo_Stores { + + // We use some MUBUF patterns here as they are the same for MTBUF + // MUBUFOffset and MUBUFAddr64 + + def _OFFSET : MTBUF_Store_Pseudo , + MTBUFAddr64Table<0>; + + def _ADDR64 : MTBUF_Store_Pseudo , + MTBUFAddr64Table<1>; + + def _OFFEN : MTBUF_Store_Pseudo ; + def _IDXEN : MTBUF_Store_Pseudo ; + def _BOTHEN : MTBUF_Store_Pseudo ; + + let DisableWQM = 1 in { + def _OFFSET_exact : MTBUF_Store_Pseudo ; + def _OFFEN_exact : MTBUF_Store_Pseudo ; + def _IDXEN_exact : MTBUF_Store_Pseudo ; + def _BOTHEN_exact : MTBUF_Store_Pseudo ; + } +} + + //===----------------------------------------------------------------------===// // MUBUF classes //===----------------------------------------------------------------------===// @@ -676,14 +882,20 @@ // MTBUF Instructions //===----------------------------------------------------------------------===// -//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0, "tbuffer_load_format_x", []>; -//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <1, "tbuffer_load_format_xy", []>; -//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <2, "tbuffer_load_format_xyz", []>; -def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Pseudo <"tbuffer_load_format_xyzw", VReg_128>; -def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Pseudo <"tbuffer_store_format_x", VGPR_32>; -def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Pseudo <"tbuffer_store_format_xy", VReg_64>; -def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Pseudo <"tbuffer_store_format_xyz", VReg_128>; -def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; +defm TBUFFER_STORE_FORMAT_XYZW : 
MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; + +// Legacy tbuffer_store support +def TBUFFER_STORE_FORMAT_LEGACY_X : MTBUF_Store_Legacy_Pseudo <"tbuffer_store_format_x", VGPR_32>; +def TBUFFER_STORE_FORMAT_LEGACY_XY : MTBUF_Store_Legacy_Pseudo <"tbuffer_store_format_xy", VReg_64>; +def TBUFFER_STORE_FORMAT_LEGACY_XYZ : MTBUF_Store_Legacy_Pseudo <"tbuffer_store_format_xyz", VReg_128>; +def TBUFFER_STORE_FORMAT_LEGACY_XYZW : MTBUF_Store_Legacy_Pseudo <"tbuffer_store_format_xyzw", VReg_128>; } // End let SubtargetPredicate = isGCN @@ -1093,8 +1305,12 @@ // MTBUF Patterns //===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// tbuffer_store_format legacy patterns +//===----------------------------------------------------------------------===// + // TBUFFER_STORE_FORMAT_*, addr64=0 -class MTBUF_StoreResource : Pat< +class MTBUF_StoreResource : Pat< (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr, i32:$soffset, imm:$inst_offset, imm:$dfmt, imm:$nfmt, imm:$offen, imm:$idxen, @@ -1105,10 +1321,99 @@ (as_i1imm $slc), (as_i1imm $tfe), $soffset) >; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; -def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; +def : MTBUF_StoreResource ; + +//===----------------------------------------------------------------------===// +// tbuffer_load/store_format patterns +//===----------------------------------------------------------------------===// + +multiclass MTBUF_LoadIntrinsicPat { + def : Pat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (vt (name v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc)), + (!cast(opcode # _BOTHEN) + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; +defm : MTBUF_LoadIntrinsicPat; + +multiclass MTBUF_StoreIntrinsicPat { + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicOffset i32:$soffset, i16:$offset), + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + 
(!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), (as_i8imm $nfmt), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, 0, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, + (as_i16imm $offset), (as_i8imm $dfmt), (as_i8imm $nfmt), + (as_i1imm $glc), (as_i1imm $slc), 0) + >; + + def : Pat< + (name vt:$vdata, v4i32:$rsrc, i32:$vindex, + (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset), + imm:$dfmt, imm:$nfmt, imm:$glc, imm:$slc), + (!cast(opcode # _BOTHEN_exact) + $vdata, + (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), + $rsrc, $soffset, (as_i16imm $offset), + (as_i8imm $dfmt), (as_i8imm $nfmt), (as_i1imm $glc), (as_i1imm $slc), 0) + >; +} + +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; +defm : MTBUF_StoreIntrinsicPat; } // End let Predicates = [isGCN] @@ -1224,21 +1529,60 @@ class MTBUF_Real_si op, MTBUF_Pseudo ps> : MTBUF_Real, + Enc64, SIMCInstr { let AssemblerPredicate=isSICI; let DecoderNamespace="SICI"; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); + let Inst{15} = ps.addr64; + let Inst{18-16} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); +} + +multiclass MTBUF_Real_AllAddr_si op> { + def _OFFSET_si : MTBUF_Real_si (NAME#"_OFFSET")>; + def _ADDR64_si : MTBUF_Real_si (NAME#"_ADDR64")>; + def _OFFEN_si : MTBUF_Real_si (NAME#"_OFFEN")>; + def _IDXEN_si : MTBUF_Real_si (NAME#"_IDXEN")>; + def _BOTHEN_si : MTBUF_Real_si (NAME#"_BOTHEN")>; +} + +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_si <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_si <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_si <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_si <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_si <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_si <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_si <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_si <7>; + +class MTBUF_Legacy_Real_si op, MTBUF_Legacy_Pseudo ps> : + MTBUF_Legacy_Real, + SIMCInstr { + let AssemblerPredicate=isSICI; + let DecoderNamespace="SICI"; + bits<1> addr64; let Inst{15} = addr64; let Inst{18-16} = op; } -def TBUFFER_LOAD_FORMAT_XYZW_si : MTBUF_Real_si <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_si : MTBUF_Real_si <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_si : MTBUF_Real_si <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_si : MTBUF_Real_si <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_si : MTBUF_Real_si <7, TBUFFER_STORE_FORMAT_XYZW>; - +def TBUFFER_STORE_FORMAT_LEGACY_X_si : MTBUF_Legacy_Real_si <4, TBUFFER_STORE_FORMAT_LEGACY_X>; +def TBUFFER_STORE_FORMAT_LEGACY_XY_si : MTBUF_Legacy_Real_si <5, TBUFFER_STORE_FORMAT_LEGACY_XY>; +def TBUFFER_STORE_FORMAT_LEGACY_XYZ_si : MTBUF_Legacy_Real_si <6, TBUFFER_STORE_FORMAT_LEGACY_XYZ>; +def 
TBUFFER_STORE_FORMAT_LEGACY_XYZW_si : MTBUF_Legacy_Real_si <7, TBUFFER_STORE_FORMAT_LEGACY_XYZW>; //===----------------------------------------------------------------------===// // CI @@ -1350,16 +1694,53 @@ class MTBUF_Real_vi op, MTBUF_Pseudo ps> : MTBUF_Real, + Enc64, SIMCInstr { let AssemblerPredicate=isVI; let DecoderNamespace="VI"; + let Inst{11-0} = !if(ps.has_offset, offset, ?); + let Inst{12} = ps.offen; + let Inst{13} = ps.idxen; + let Inst{14} = !if(ps.has_glc, glc, ps.glc_value); let Inst{18-15} = op; + let Inst{22-19} = !if(ps.has_dfmt, dfmt, ps.dfmt_value); + let Inst{25-23} = !if(ps.has_nfmt, nfmt, ps.nfmt_value); + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = !if(ps.has_vaddr, vaddr, ?); + let Inst{47-40} = !if(ps.has_vdata, vdata, ?); + let Inst{52-48} = !if(ps.has_srsrc, srsrc{6-2}, ?); + let Inst{54} = !if(ps.has_slc, slc, ?); + let Inst{55} = !if(ps.has_tfe, tfe, ?); + let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -def TBUFFER_LOAD_FORMAT_XYZW_vi : MTBUF_Real_vi <3, TBUFFER_LOAD_FORMAT_XYZW>; -def TBUFFER_STORE_FORMAT_X_vi : MTBUF_Real_vi <4, TBUFFER_STORE_FORMAT_X>; -def TBUFFER_STORE_FORMAT_XY_vi : MTBUF_Real_vi <5, TBUFFER_STORE_FORMAT_XY>; -def TBUFFER_STORE_FORMAT_XYZ_vi : MTBUF_Real_vi <6, TBUFFER_STORE_FORMAT_XYZ>; -def TBUFFER_STORE_FORMAT_XYZW_vi : MTBUF_Real_vi <7, TBUFFER_STORE_FORMAT_XYZW>; +multiclass MTBUF_Real_AllAddr_vi op> { + def _OFFSET_vi : MTBUF_Real_vi (NAME#"_OFFSET")>; + def _OFFEN_vi : MTBUF_Real_vi (NAME#"_OFFEN")>; + def _IDXEN_vi : MTBUF_Real_vi (NAME#"_IDXEN")>; + def _BOTHEN_vi : MTBUF_Real_vi (NAME#"_BOTHEN")>; +} + +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_vi <0>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_vi <1>; +//defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <2>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <3>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_vi <4>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_vi <5>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_vi <6>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_vi <7>; + +class MTBUF_Legacy_Real_vi op, MTBUF_Legacy_Pseudo ps> : + MTBUF_Legacy_Real, + SIMCInstr { + let AssemblerPredicate=isVI; + let DecoderNamespace="VI"; + + let Inst{18-15} = op; +} +def TBUFFER_STORE_FORMAT_LEGACY_X_vi : MTBUF_Legacy_Real_vi <4, TBUFFER_STORE_FORMAT_LEGACY_X>; +def TBUFFER_STORE_FORMAT_LEGACY_XY_vi : MTBUF_Legacy_Real_vi <5, TBUFFER_STORE_FORMAT_LEGACY_XY>; +def TBUFFER_STORE_FORMAT_LEGACY_XYZ_vi : MTBUF_Legacy_Real_vi <6, TBUFFER_STORE_FORMAT_LEGACY_XYZ>; +def TBUFFER_STORE_FORMAT_LEGACY_XYZW_vi : MTBUF_Legacy_Real_vi <7, TBUFFER_STORE_FORMAT_LEGACY_XYZW>; Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h =================================================================== --- lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -84,7 +84,11 @@ const MCSubtargetInfo &STI, raw_ostream &O); void printExpVM(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); - + void printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); Index: lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp =================================================================== --- 
lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -216,6 +216,24 @@ O << " vm"; } +void AMDGPUInstPrinter::printDFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " dfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + +void AMDGPUInstPrinter::printNFMT(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + if (MI->getOperand(OpNo).getImm()) { + O << " nfmt:"; + printU8ImmDecOperand(MI, OpNo, O); + } +} + void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, const MCRegisterInfo &MRI) { switch (RegNo) { Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -3128,6 +3128,8 @@ SelectionDAG &DAG) const { unsigned IntrID = cast(Op.getOperand(1))->getZExtValue(); SDLoc DL(Op); + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrID) { case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: { @@ -3153,7 +3155,6 @@ Op.getOperand(5), // glc Op.getOperand(6) // slc }; - MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo(); unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? @@ -3168,6 +3169,27 @@ return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO); } + case Intrinsic::amdgcn_tbuffer_load: { + SDValue Ops[] = { + Op.getOperand(0), // Chain + Op.getOperand(2), // rsrc + Op.getOperand(3), // vaddr + Op.getOperand(4), // offset + Op.getOperand(5), // dfmt + Op.getOperand(6), // nfmt + Op.getOperand(7), // glc + Op.getOperand(8) // slc + }; + + EVT VT = Op.getOperand(2).getValueType(); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad, + VT.getStoreSize(), VT.getStoreSize()); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } // Basic sample. 
case Intrinsic::amdgcn_image_sample: case Intrinsic::amdgcn_image_sample_cl: @@ -3233,11 +3255,11 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); unsigned IntrinsicID = cast(Op.getOperand(1))->getZExtValue(); - + MachineFunction &MF = DAG.getMachineFunction(); + switch (IntrinsicID) { case Intrinsic::amdgcn_exp: { const ConstantSDNode *Tgt = cast(Op.getOperand(2)); @@ -3303,33 +3325,6 @@ return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, Op.getOperand(2), Op.getOperand(3)); } - case AMDGPUIntrinsic::SI_tbuffer_store: { - SDValue Ops[] = { - Chain, - Op.getOperand(2), - Op.getOperand(3), - Op.getOperand(4), - Op.getOperand(5), - Op.getOperand(6), - Op.getOperand(7), - Op.getOperand(8), - Op.getOperand(9), - Op.getOperand(10), - Op.getOperand(11), - Op.getOperand(12), - Op.getOperand(13), - Op.getOperand(14) - }; - - EVT VT = Op.getOperand(3).getValueType(); - - MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(), - MachineMemOperand::MOStore, - VT.getStoreSize(), 4); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, - Op->getVTList(), Ops, VT, MMO); - } case AMDGPUIntrinsic::AMDGPU_kill: { SDValue Src = Op.getOperand(2); if (const ConstantFPSDNode *K = dyn_cast(Src)) { @@ -3345,7 +3340,6 @@ } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { - const MachineFunction &MF = DAG.getMachineFunction(); const SISubtarget &ST = MF.getSubtarget(); unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) @@ -3354,6 +3348,32 @@ } return SDValue(); }; + case AMDGPUIntrinsic::SI_tbuffer_store: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), + Op.getOperand(3), + Op.getOperand(4), + Op.getOperand(5), + Op.getOperand(6), + Op.getOperand(7), + Op.getOperand(8), + Op.getOperand(9), + Op.getOperand(10), + Op.getOperand(11), + Op.getOperand(12), + Op.getOperand(13), + Op.getOperand(14) + }; + EVT VT = Op.getOperand(3).getValueType(); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOStore, + VT.getStoreSize(), 4); + return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL, + Op->getVTList(), Ops, VT, MMO); + } + default: return Op; } Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -39,6 +39,7 @@ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; +// Legacy tbuffer_store support (llvm.SI.tbuffer.store) def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT", SDTypeProfile<0, 13, [SDTCisVT<0, v4i32>, // rsrc(SGPR) @@ -58,6 +59,20 @@ [SDNPMayStore, SDNPMemOperand, SDNPHasChain] >; +def SItbuffer_load : SDNode<"AMDGPUISD::TBUFFER_LOAD_FORMAT", + SDTypeProfile<1, 7, + [ // vdata + SDTCisVT<1, v4i32>, // rsrc + SDTCisVT<2, i32>, // vaddr + SDTCisVT<3, i32>, // offset + SDTCisVT<4, i32>, // dfmt(imm) + SDTCisVT<5, i32>, // nfmt(imm) + SDTCisVT<6, i32>, // glc(imm) + SDTCisVT<7, i32> // slc(imm) + ]>, + [SDNPMayLoad, SDNPMemOperand, SDNPHasChain] +>; + def SDTBufferLoad : SDTypeProfile<1, 5, [ // vdata SDTCisVT<1, v4i32>, // rsrc @@ -499,6 +514,9 @@ def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>; def exp_vm : NamedOperandBit<"ExpVM", NamedMatchClass<"ExpVM">>; +def DFMT : 
NamedOperandU8<"DFMT", NamedMatchClass<"DFMT">>; +def NFMT : NamedOperandU8<"NFMT", NamedMatchClass<"NFMT">>; + def dmask : NamedOperandU16<"DMask", NamedMatchClass<"DMask">>; def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>; Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.load.ll @@ -0,0 +1,127 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI + +;CHECK-LABEL: {{^}}tbuffer_load: +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:3, 0 glc +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:6, nfmt:1, 0 slc +;CHECK: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x float>, <4 x float>} @tbuffer_load(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 15, i32 3, i1 1, i1 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 0, i32 6, i32 1, i1 0, i1 1) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}tbuffer_load_immoffs: +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offset:42 +define amdgpu_vs <4 x float> @tbuffer_load_immoffs(<4 x i32> inreg) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 42, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +;CHECK-LABEL: {{^}}tbuffer_load_immoffs_large +;SICI-DAG: v_mov_b32_e32 [[VOFS1:v[0-9]+]], 0x103c +;SICI-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS1]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 0 offen +;VI-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:15, nfmt:2, 61 offset:4095 + +;SICI-DAG: v_mov_b32_e32 [[VOFS2:v[0-9]+]], 0x8048 +;SICI-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS2]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, 0 offen +;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff +;VI-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:3, [[OFS1]] offset:73 + +;SICI-DAG: v_mov_b32_e32 [[VOFS3:v[0-9]+]], 0x9000 +;SICI-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS3]], {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 offen +;VI-DAG: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff +;VI-DAG: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, [[OFS2]] offset:1 + +;CHECK: s_waitcnt +define amdgpu_vs {<4 x float>, <4 x 
float>, <4 x float>} @tbuffer_load_immoffs_large(<4 x i32> inreg) { + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 4156, i32 15, i32 2, i1 0, i1 0) + %vdata_glc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 32840, i32 14, i32 3, i1 0, i1 0) + %vdata_slc = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 36864, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + %vdata_glc.f = bitcast <4 x i32> %vdata_glc to <4 x float> + %vdata_slc.f = bitcast <4 x i32> %vdata_slc to <4 x float> + %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %vdata.f, 0 + %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %vdata_glc.f, 1 + %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %vdata_slc.f, 2 + ret {<4 x float>, <4 x float>, <4 x float>} %r2 +} + +;CHECK-LABEL: {{^}}tbuffer_load_idx: +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen +define amdgpu_vs <4 x float> @tbuffer_load_idx(<4 x i32> inreg, i32 %vindex) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 0, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +;CHECK-LABEL: {{^}}tbuffer_load_ofs: +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen +define amdgpu_vs <4 x float> @tbuffer_load_ofs(<4 x i32> inreg, i32 %offs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %offs, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +;CHECK-LABEL: {{^}}tbuffer_load_ofs_imm: +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 offen offset:52 +define amdgpu_vs <4 x float> @tbuffer_load_ofs_imm(<4 x i32> inreg, i32 %inval) { +main_body: + %offs = add i32 %inval, 52 + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 0, i32 %offs, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +;CHECK-LABEL: {{^}}tbuffer_load_both: +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen +define amdgpu_vs <4 x float> @tbuffer_load_both(<4 x i32> inreg, i32 %vindex, i32 %offs) { +main_body: + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %offs, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + +;CHECK-LABEL: {{^}}tbuffer_load_both_inst_offs: +;CHECK: tbuffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, dfmt:14, nfmt:4, 0 idxen offen offset:52 +define amdgpu_vs <4 x float> @tbuffer_load_both_inst_offs(<4 x i32> inreg, i32 %vindex, i32 %offs) { +main_body: + %offs.2 = add i32 %offs, 52 + %vdata = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32> %0, i32 %vindex, i32 %offs.2, i32 14, i32 4, i1 0, i1 0) + %vdata.f = bitcast <4 x i32> %vdata to <4 x float> + ret <4 x float> %vdata.f +} + + +;CHECK-LABEL: {{^}}buffer_load_xy: +;CHECK: tbuffer_load_format_xy {{v\[[0-9]+:[0-9]+\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs <2 x float> 
@buffer_load_xy(<4 x i32> inreg %rsrc) { + %vdata = call <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32> %rsrc, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast <2 x i32> %vdata to <2 x float> + ret <2 x float> %vdata.f +} + +;CHECK-LABEL: {{^}}buffer_load_x: +;CHECK: tbuffer_load_format_x {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, dfmt:13, nfmt:4, 0 +define amdgpu_vs float @buffer_load_x(<4 x i32> inreg %rsrc) { + %vdata = call i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 13, i32 4, i1 0, i1 0) + %vdata.f = bitcast i32 %vdata to float + ret float %vdata.f +} + +declare i32 @llvm.amdgcn.tbuffer.load.i32(<4 x i32>, i32, i32, i32, i32, i1, i1) +declare <2 x i32> @llvm.amdgcn.tbuffer.load.v2i32(<4 x i32>, i32, i32, i32, i32, i1, i1) +declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i1, i1) Index: test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.tbuffer.store.ll @@ -0,0 +1,108 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +;CHECK-LABEL: {{^}}tbuffer_store: +;CHECK: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:12, nfmt:2, 0 +;CHECK: tbuffer_store_format_xyzw v[4:7], off, s[0:3], dfmt:13, nfmt:3, 0 glc +;CHECK: tbuffer_store_format_xyzw v[8:11], off, s[0:3], dfmt:14, nfmt:4, 0 slc +define amdgpu_ps void @tbuffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + %in2 = bitcast <4 x float> %2 to <4 x i32> + %in3 = bitcast <4 x float> %3 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 0, i32 12, i32 2, i1 0, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in2, <4 x i32> %0, i32 0, i32 0, i32 13, i32 3, i1 1, i1 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in3, <4 x i32> %0, i32 0, i32 0, i32 14, i32 4, i1 0, i1 1) + ret void +} + +;CHECK-LABEL: {{^}}tbuffer_store_immoffs: +;CHECK: tbuffer_store_format_xyzw v[0:3], off, s[0:3], dfmt:5, nfmt:7, 0 offset:42 +define amdgpu_ps void @tbuffer_store_immoffs(<4 x i32> inreg, <4 x float>) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 42, i32 5, i32 7, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_idx: +;CHECK: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %2, i32 0, i32 15, i32 2, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_ofs: +;CHECK: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:3, nfmt:7, 0 offen +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 0, i32 %2, i32 3, i32 7, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both: +;CHECK: tbuffer_store_format_xyzw v[0:3], v[4:5], s[0:3], dfmt:6, nfmt:4, 0 idxen offen +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call 
void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %2, i32 %3, i32 6, i32 4, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_both_reversed: +;CHECK: v_mov_b32_e32 v6, v4 +;CHECK: tbuffer_store_format_xyzw v[0:3], v[5:6], s[0:3], dfmt:15, nfmt:3, 0 idxen offen +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %3, i32 %2, i32 15, i32 3, i1 0, i1 0) + ret void +} + +; Ideally, the register allocator would avoid the wait here +; +;CHECK-LABEL: {{^}}buffer_store_wait: +;CHECK: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], dfmt:15, nfmt:3, 0 idxen +;CHECK: s_waitcnt vmcnt(0) expcnt(0) +;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen +;CHECK: s_waitcnt vmcnt(0) +;CHECK: tbuffer_store_format_xyzw v[0:3], v6, s[0:3], dfmt:16, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { +main_body: + %in1 = bitcast <4 x float> %1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %in1, <4 x i32> %0, i32 %2, i32 0, i32 15, i32 3, i1 0, i1 0) + %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) + %data.i = bitcast <4 x float> %data to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %data.i, <4 x i32> %0, i32 %4, i32 0, i32 16, i32 2, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x1: +;CHECK: tbuffer_store_format_x v0, v1, s[0:3], dfmt:13, nfmt:7, 0 idxen +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { +main_body: + %data.i = bitcast float %data to i32 + call void @llvm.amdgcn.tbuffer.store.i32(i32 %data.i, <4 x i32> %rsrc, i32 %index, i32 0, i32 13, i32 7, i1 0, i1 0) + ret void +} + +;CHECK-LABEL: {{^}}buffer_store_x2: +;CHECK: tbuffer_store_format_xy v[0:1], v2, s[0:3], dfmt:1, nfmt:2, 0 idxen +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { +main_body: + %data.i = bitcast <2 x float> %data to <2 x i32> + call void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32> %data.i, <4 x i32> %rsrc, i32 %index, i32 0, i32 1, i32 2, i1 0, i1 0) + ret void +} + +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v2i32(<2 x i32>, <4 x i32>, i32, i32, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } + Index: test/CodeGen/AMDGPU/merge-store-crash.ll =================================================================== --- test/CodeGen/AMDGPU/merge-store-crash.ll +++ test/CodeGen/AMDGPU/merge-store-crash.ll @@ -26,11 +26,11 @@ %tmp9 = insertelement <4 x i32> %tmp8, i32 %tmp7, i32 1 %tmp10 = insertelement <4 x i32> %tmp9, i32 undef, i32 2 %tmp11 = insertelement <4 x i32> %tmp10, i32 undef, i32 3 - call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %tmp11, i32 4, i32 undef, i32 %arg, i32 0, i32 14, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %tmp11, <4 x i32> undef, i32 undef, i32 %arg, i32 14, i32 4, i1 1, i1 1) ret void } ; Function Attrs: nounwind -declare void @llvm.SI.tbuffer.store.v4i32(<16 x 
i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i1, i1) #0 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/merge-store-usedef.ll =================================================================== --- test/CodeGen/AMDGPU/merge-store-usedef.ll +++ test/CodeGen/AMDGPU/merge-store-usedef.ll @@ -11,13 +11,13 @@ store i32 %v, i32 addrspace(3)* %p0 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %v, i32 1, i32 undef, i32 undef, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %v, <4 x i32> undef, i32 0, i32 0, i32 4, i32 4, i1 1, i1 0) %w = load i32, i32 addrspace(3)* %p0 store i32 %w, i32 addrspace(3)* %p1 ret void } -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i1, i1) #0 attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/mubuf.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf.ll +++ test/CodeGen/AMDGPU/mubuf.ll @@ -62,7 +62,8 @@ %tmp2 = shl i32 %6, 2 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 64, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 4, i32 4, i1 1, i1 1) ret void } @@ -80,7 +81,8 @@ %tmp2 = shl i32 %6, 2 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> %tmp1, i32 %tmp2, i32 65, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) %tmp4 = add i32 %6, 16 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> %tmp1, i32 %tmp3, i32 1, i32 %tmp4, i32 %4, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + %tmp1.4xi32 = bitcast <16 x i8> %tmp1 to <4 x i32> + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> %tmp1.4xi32, i32 %tmp4, i32 %4, i32 4, i32 4, i1 1, i1 1) ret void } @@ -175,6 +177,6 @@ } declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i1, i1) attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/scheduler-subrange-crash.ll =================================================================== --- test/CodeGen/AMDGPU/scheduler-subrange-crash.ll +++ test/CodeGen/AMDGPU/scheduler-subrange-crash.ll @@ -25,29 +25,29 @@ %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2 %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3 %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp3, i32 1, i32 36, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp3, <4 x i32> undef, i32 36, i32 %arg, i32 4, i32 4, i1 1, i1 1) %bc = bitcast <4 x float> %array_vector3 to <4 x i32> %tmp4 = extractelement <4 x i32> %bc, i32 undef - call void 
@llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp4, i32 1, i32 48, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp4, <4 x i32> undef, i32 48, i32 %arg, i32 4, i32 4, i1 1, i1 1) %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32> %tmp5 = extractelement <4 x i32> %bc49, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp5, i32 1, i32 72, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp5, <4 x i32> undef, i32 72, i32 %arg, i32 4, i32 4, i1 1, i1 1) %array_vector21 = insertelement <4 x float> , float %tmp, i32 1 %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2 %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3 - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 28, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 28, i32 %arg, i32 4, i32 4, i1 1, i1 1) %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32> %tmp6 = extractelement <4 x i32> %bc52, i32 undef - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp6, i32 1, i32 64, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 20, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 56, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) - call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 92, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) + call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp6, <4 x i32> undef, i32 64, i32 %arg, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 20, i32 %arg, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 56, i32 %arg, i32 4, i32 4, i1 1, i1 1) + call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 92, i32 %arg, i32 4, i32 4, i1 1, i1 1) ret void } declare float @llvm.SI.load.const(<16 x i8>, i32) #1 declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #2 -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i1, i1) #3 attributes #0 = { nounwind "target-cpu"="tonga" } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s -declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) +declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i1, i1) +declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, 
i32, i32, i1, i1) declare void @llvm.amdgcn.s.barrier() #1 declare i32 @llvm.amdgcn.workitem.id.x() #2 @@ -258,9 +258,8 @@ ; %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 ; %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 -; call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, -; i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, -; i32 1, i32 0) +; call void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32> %vdata, <4 x i32> undef, +; i32 %vaddr, i32 32, i32 14, i32 4, i1 1, i1 1) ; %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4
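
For reference, a minimal usage sketch (not part of the patch) that exercises both new intrinsics end to end, following the signatures added in IntrinsicsAMDGPU.td and the declarations in the lit tests above. The function name, the amdgpu_ps calling convention, and the dfmt/nfmt/offset values are illustrative assumptions; the operand order is the 8-operand form that the updated tests migrate to from the legacy 13-operand llvm.SI.tbuffer.store intrinsic.

; Sketch only: copy four dwords through the typed-buffer path using the new
; operand order (rsrc, vindex, offset, dfmt, nfmt, glc, slc for the load;
; vdata first for the store). dfmt:14/nfmt:4 mirror the values used in the
; tests above and are illustrative, as is the content of %rsrc.
define amdgpu_ps void @tbuffer_copy_sketch(<4 x i32> inreg %rsrc, i32 %vindex) {
main_body:
  %data = call <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(
      <4 x i32> %rsrc,    ; resource descriptor (SGPR)
      i32 %vindex,        ; vaddr/vindex (VGPR), selects the idxen variant
      i32 0,              ; offset (SGPR/VGPR/imm)
      i32 14, i32 4,      ; dfmt, nfmt (immediates)
      i1 0, i1 0)         ; glc, slc (immediates)
  call void @llvm.amdgcn.tbuffer.store.v4i32(
      <4 x i32> %data,    ; vdata (VGPR)
      <4 x i32> %rsrc,    ; resource descriptor (SGPR)
      i32 %vindex,        ; vaddr/vindex (VGPR)
      i32 16,             ; immediate offset, folded into the 12-bit offset field
      i32 14, i32 4,      ; dfmt, nfmt (immediates)
      i1 0, i1 0)         ; glc, slc (immediates)
  ret void
}

declare <4 x i32> @llvm.amdgcn.tbuffer.load.v4i32(<4 x i32>, i32, i32, i32, i32, i1, i1)
declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i1, i1)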