Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -99,6 +99,19 @@
   GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
   Intrinsic<[], [], []>;
 
+// SI+
+def int_amdgcn_buffer_load_format :
+  Intrinsic<[llvm_anyfloat_ty],
+            [llvm_v4i32_ty, // rsrc(SGPR)
+             llvm_i32_ty,   // sgpr_offset(SGPR or 0)
+             llvm_i32_ty,   // inst_offset(imm)
+             llvm_i32_ty,   // vgpr_offset(VGPR or 0)
+             llvm_i32_ty,   // vgpr_index(VGPR or 0)
+             llvm_i1_ty,    // glc
+             llvm_i1_ty,    // slc
+             llvm_i1_ty],   // tfe
+            [IntrNoMem]>;
+
 // On CI+
 def int_amdgcn_buffer_wbinvl1_vol :
   GCCBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">,
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
+FunctionPass *createSILoadShrinkPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
@@ -56,6 +57,9 @@
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
+void initializeSILoadShrinkPass(PassRegistry &);
+extern char &SILoadShrinkPassID;
+
 void initializeSILoadStoreOptimizerPass(PassRegistry &);
 extern char &SILoadStoreOptimizerID;
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -48,6 +48,7 @@
   initializeSIFoldOperandsPass(*PR);
   initializeSIFixSGPRLiveRangesPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
+  initializeSILoadShrinkPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
 }
 
@@ -165,6 +166,7 @@
 public:
   GCNPassConfig(TargetMachine *TM, PassManagerBase &PM)
     : AMDGPUPassConfig(TM, PM) { }
+  void addCodeGenPrepare() override;
   bool addPreISel() override;
   bool addInstSelector() override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -265,6 +267,11 @@
 // GCN Pass Setup
 //===----------------------------------------------------------------------===//
 
+void GCNPassConfig::addCodeGenPrepare() {
+  addPass(createSILoadShrinkPass());
+  AMDGPUPassConfig::addCodeGenPrepare();
+}
+
 bool GCNPassConfig::addPreISel() {
   AMDGPUPassConfig::addPreISel();
   addPass(createSinkingPass());
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
+  SILoadShrink.cpp
   SILoadStoreOptimizer.cpp
   SILowerControlFlow.cpp
   SILowerI1Copies.cpp
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -42,8 +42,6 @@
   SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
-  void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
-
   SDValue performUCharToFloatCombine(SDNode *N,
                                      DAGCombinerInfo &DCI) const;
   SDValue performSHLPtrCombine(SDNode *N,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1999,97 +1999,6 @@
   return -1;
 }
 
-/// \brief Helper function for adjustWritemask
-static unsigned SubIdx2Lane(unsigned Idx) {
-  switch (Idx) {
-  default: return 0;
-  case AMDGPU::sub0: return 0;
-  case AMDGPU::sub1: return 1;
-  case AMDGPU::sub2: return 2;
-  case AMDGPU::sub3: return 3;
-  }
-}
-
-/// \brief Adjust the writemask of MIMG instructions
-void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
-                                       SelectionDAG &DAG) const {
-  SDNode *Users[4] = { };
-  unsigned Lane = 0;
-  unsigned OldDmask = Node->getConstantOperandVal(0);
-  unsigned NewDmask = 0;
-
-  // Try to figure out the used register components
-  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
-       I != E; ++I) {
-
-    // Abort if we can't understand the usage
-    if (!I->isMachineOpcode() ||
-        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
-      return;
-
-    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
-    // Note that subregs are packed, i.e. Lane==0 is the first bit set
-    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
-    // set, etc.
-    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
-
-    // Set which texture component corresponds to the lane.
-    unsigned Comp;
-    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
-      assert(Dmask);
-      Comp = countTrailingZeros(Dmask);
-      Dmask &= ~(1 << Comp);
-    }
-
-    // Abort if we have more than one user per component
-    if (Users[Lane])
-      return;
-
-    Users[Lane] = *I;
-    NewDmask |= 1 << Comp;
-  }
-
-  // Abort if there's no change
-  if (NewDmask == OldDmask)
-    return;
-
-  // Adjust the writemask in the node
-  std::vector<SDValue> Ops;
-  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
-  Ops.insert(Ops.end(), Node->op_begin() + 1, Node->op_end());
-  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
-
-  // If we only got one lane, replace it with a copy
-  // (if NewDmask has only one bit set...)
-  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
-    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
-                                       MVT::i32);
-    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
-                                      SDLoc(), Users[Lane]->getValueType(0),
-                                      SDValue(Node, 0), RC);
-    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
-    return;
-  }
-
-  // Update the users of the node with the new indices
-  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
-
-    SDNode *User = Users[i];
-    if (!User)
-      continue;
-
-    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
-    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
-
-    switch (Idx) {
-    default: break;
-    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
-    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
-    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
-    }
-  }
-}
-
 static bool isFrameIndexOp(SDValue Op) {
   if (Op.getOpcode() == ISD::AssertZext)
     Op = Op.getOperand(0);
@@ -2125,8 +2034,25 @@
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
-  if (TII->isMIMG(Node->getMachineOpcode()))
-    adjustWritemask(Node, DAG);
+  if (TII->isMIMG(Node->getMachineOpcode())) {
+    unsigned NumWritten = countPopulation(Node->getConstantOperandVal(0));
+    if (NumWritten == 1) {
+      SDValue RC =
+          DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(), MVT::i32);
+      SDNode *Copy = DAG.getMachineNode(
+          TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
+          Node->getValueType(0).getVectorElementType(), SDValue(Node, 0), RC);
+
+      for (SDNode *Use : Node->uses()) {
+        if (Use != Copy) {
+          assert(Use->isMachineOpcode() &&
+                 Use->getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG);
+          DAG.ReplaceAllUsesWith(Use, Copy);
+        }
+      }
+    }
+    return Node;
+  }
 
   if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
       Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -2982,6 +2982,56 @@
 defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET,
                          BUFFER_LOAD_DWORDX4_OFFEN,
                          BUFFER_LOAD_DWORDX4_IDXEN,
                          BUFFER_LOAD_DWORDX4_BOTHEN>;
 
+multiclass MUBUF_Load_Format <ValueType vt, MUBUF offset, MUBUF offen,
+                              MUBUF idxen, MUBUF bothen> {
+  def : Pat <
+    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset,
+                                        0, 0, imm:$glc, imm:$slc, imm:$tfe)),
+    (offset $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+            (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset,
+                                        i32:$voffset, 0, imm:$glc, imm:$slc,
+                                        imm:$tfe)),
+    (offen $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+           (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset,
+                                        0, i32:$vindex, imm:$glc, imm:$slc,
+                                        imm:$tfe)),
+    (idxen $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i1imm $glc),
+           (as_i1imm $slc), (as_i1imm $tfe))
+  >;
+
+  def : Pat <
+    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$soffset, imm:$offset,
+                                        i32:$voffset, i32:$vindex, imm:$glc,
+                                        imm:$slc, imm:$tfe)),
+    (bothen (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc,
+            $soffset, (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc),
+            (as_i1imm $tfe))
+  >;
+}
+
+defm : MUBUF_Load_Format <f32, BUFFER_LOAD_FORMAT_X_OFFSET,
+                          BUFFER_LOAD_FORMAT_X_OFFEN,
+                          BUFFER_LOAD_FORMAT_X_IDXEN,
+                          BUFFER_LOAD_FORMAT_X_BOTHEN>;
+
+defm : MUBUF_Load_Format <v2f32, BUFFER_LOAD_FORMAT_XY_OFFSET,
+                          BUFFER_LOAD_FORMAT_XY_OFFEN,
+                          BUFFER_LOAD_FORMAT_XY_IDXEN,
+                          BUFFER_LOAD_FORMAT_XY_BOTHEN>;
+
+defm : MUBUF_Load_Format <v4f32, BUFFER_LOAD_FORMAT_XYZW_OFFSET,
+                          BUFFER_LOAD_FORMAT_XYZW_OFFEN,
+                          BUFFER_LOAD_FORMAT_XYZW_IDXEN,
+                          BUFFER_LOAD_FORMAT_XYZW_BOTHEN>;
+
 class MUBUFScratchStorePat <MUBUF Instr, ValueType vt, PatFrag st> : Pat <
   (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
                                u16imm:$offset)),
Index: lib/Target/AMDGPU/SILoadShrink.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SILoadShrink.cpp
@@ -0,0 +1,246 @@
+//===-- SILoadShrink.cpp - Shrink load intrinsics -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass shrinks target-specific intrinsics that perform loads, such as
+// buffer.load.format.* (and TODO: tex), when only part of the loaded value
+// is actually used.
+//
+//===----------------------------------------------------------------------===//
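+//
+// Illustrative example (not taken from this patch's tests): given
+//
+//   %v = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(...)
+//   %x = extractelement <4 x float> %v, i32 0
+//
+// where no other element of %v is used, the call is rewritten to the scalar
+// overload and the users of %x are redirected to its result:
+//
+//   %x = call float @llvm.amdgcn.buffer.load.format.f32(...)
+//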
+
+#include "AMDGPU.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "si-load-shrink"
+
+using namespace llvm;
+
+namespace {
+
+class SILoadShrink : public FunctionPass, public InstVisitor<SILoadShrink> {
+public:
+  static char ID;
+
+  SILoadShrink() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  const char *getPassName() const override { return "SI Load Shrink"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    FunctionPass::getAnalysisUsage(AU);
+  }
+
+  void visitCallInst(CallInst &C);
+  void visitIntrinsicInst(IntrinsicInst &I);
+
+private:
+  void adjustReturnType(IntrinsicInst &I);
+  void adjustWritemask(CallInst &Call, int DMaskIndex);
+
+  bool AnyChanges;
+  std::vector<Instruction *> Replaced;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILoadShrink, DEBUG_TYPE, "SI Load Shrink", false, false)
+INITIALIZE_PASS_END(SILoadShrink, DEBUG_TYPE, "SI Load Shrink", false, false)
+
+char SILoadShrink::ID = 0;
+
+FunctionPass *llvm::createSILoadShrinkPass() { return new SILoadShrink; }
+
+bool SILoadShrink::runOnFunction(Function &F) {
+  AnyChanges = false;
+
+  visit(F);
+
+  for (Instruction *I : Replaced) {
+    I->eraseFromParent();
+  }
+  Replaced.clear();
+
+  return AnyChanges;
+}
+
+void SILoadShrink::visitCallInst(CallInst &I) {
+  Function *Callee = I.getCalledFunction();
+  if (!Callee)
+    return;
+
+  // TODO Move SI intrinsics to global list so IntrinsicID works?
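+  // The llvm.SI.image.sample.* and llvm.SI.gather4.* intrinsics are matched
+  // by name prefix; their dmask operand is argument index 3, which is what
+  // adjustWritemask() rewrites.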
+  static const char SI_image_sample[] = "llvm.SI.image.sample.";
+  static const char SI_gather4[] = "llvm.SI.gather4.";
+  if (Callee->getName().startswith(SI_image_sample) ||
+      Callee->getName().startswith(SI_gather4)) {
+    adjustWritemask(I, 3);
+  }
+}
+
+void SILoadShrink::visitIntrinsicInst(IntrinsicInst &I) {
+  Function *Callee = I.getCalledFunction();
+
+  if (Callee->getIntrinsicID() == Intrinsic::amdgcn_buffer_load_format) {
+    adjustReturnType(I);
+  }
+}
+
+void SILoadShrink::adjustReturnType(IntrinsicInst &I) {
+  Type *OrigType = I.getType();
+  if (!OrigType->isVectorTy())
+    return;
+
+  const unsigned OrigNumWritten = OrigType->getVectorNumElements();
+  uint64_t HighestIndex = 0;
+
+  SmallVector<ExtractElementInst *, 4> Uses;
+  for (const auto &use : I.uses()) {
+    ExtractElementInst *EE = dyn_cast<ExtractElementInst>(use.getUser());
+    if (!EE) {
+      return;
+    }
+
+    ConstantInt *IndexConstant = dyn_cast<ConstantInt>(EE->getIndexOperand());
+    if (!IndexConstant) {
+      return;
+    }
+
+    Uses.push_back(EE);
+
+    if (EE->use_empty())
+      continue;
+
+    HighestIndex = std::max(HighestIndex, IndexConstant->getZExtValue());
+    if (HighestIndex + 1 >= OrigNumWritten)
+      return;
+  }
+
+  unsigned NewNumWritten = HighestIndex + 1;
+  if (NewNumWritten == 3) {
+    NewNumWritten = 4; // TODO XYZ codegen
+    if (NewNumWritten >= OrigNumWritten) {
+      return;
+    }
+  }
+
+  DEBUG(dbgs() << "SILoadShrink: from " << OrigNumWritten << " to "
+               << NewNumWritten << "\n");
+
+  IRBuilder<> Builder(&I);
+
+  Type *NewTypes[1];
+  if (NewNumWritten == 1) {
+    NewTypes[0] = Type::getFloatTy(I.getContext());
+  } else {
+    NewTypes[0] =
+        VectorType::get(OrigType->getVectorElementType(), NewNumWritten);
+  }
+
+  Function *NewCallee =
+      Intrinsic::getDeclaration(I.getModule(), I.getIntrinsicID(), NewTypes);
+  SmallVector<Value *, 8> Args;
+  for (const Use &Arg : I.arg_operands())
+    Args.push_back(Arg.get());
+  CallInst *NewCall = Builder.CreateCall(NewCallee, Args);
+
+  for (ExtractElementInst *EE : Uses) {
+    if (!EE->use_empty()) {
+      if (NewNumWritten == 1) {
+        EE->replaceAllUsesWith(NewCall);
+      } else {
+        EE->replaceAllUsesWith(
+            Builder.CreateExtractElement(NewCall, EE->getIndexOperand()));
+      }
+    }
+    Replaced.push_back(EE);
+  }
+
+  Replaced.push_back(&I);
+
+  AnyChanges = true;
+}
+
+void SILoadShrink::adjustWritemask(CallInst &Call, int DMaskIndex) {
+  ConstantInt *DMaskConstant =
+      dyn_cast<ConstantInt>(Call.getArgOperand(DMaskIndex));
+  if (!DMaskConstant)
+    return;
+
+  const unsigned OrigDMask = DMaskConstant->getZExtValue();
+  const unsigned OrigNumWritten = countPopulation(OrigDMask);
+
+  // Collect all uses, bailing out early when everything is used
+  const unsigned UseAllMask = (1 << OrigNumWritten) - 1;
+  unsigned UseMask = 0;
+  SmallVector<std::pair<unsigned, ExtractElementInst *>, 4> Uses;
+
+  for (const auto &use : Call.uses()) {
+    ExtractElementInst *EE = dyn_cast<ExtractElementInst>(use.getUser());
+    if (!EE) {
+      return;
+    }
+
+    ConstantInt *IndexConstant = dyn_cast<ConstantInt>(EE->getIndexOperand());
+    if (!IndexConstant) {
+      return;
+    }
+
+    if (EE->use_empty())
+      continue;
+
+    const unsigned Index = IndexConstant->getZExtValue();
+    UseMask |= 1 << Index;
+    if (UseMask == UseAllMask)
+      return;
+
+    Uses.emplace_back(Index, EE);
+  }
+
+  // If not all written channels are used, compute index remapping
+  Type *Int32Ty = Type::getInt32Ty(Call.getContext());
+  SmallVector<ExtractElementInst *, 4> Remapped;
+  unsigned NewDMask = 0;
+  unsigned NewIndex = 0;
+  unsigned Component = 0;
+  for (unsigned OrigIndex = 0; OrigIndex < OrigNumWritten;
+       ++OrigIndex, ++Component) {
+    while ((OrigDMask & (1 << Component)) == 0)
+      ++Component;
+
+    ExtractElementInst *NewEE = nullptr;
+    if (UseMask & (1 << OrigIndex)) {
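+      // This channel remains live. If its position within the shrunk result
+      // changes, build a replacement extractelement at the new index; users
+      // of the old extract are redirected to it after the loop.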
+      if (OrigIndex != NewIndex) {
+        DEBUG(dbgs() << "  Remap " << OrigIndex << " to " << NewIndex << "\n");
+        NewEE = ExtractElementInst::Create(&Call,
+                                           ConstantInt::get(Int32Ty, NewIndex));
+        NewEE->insertAfter(&Call);
+      }
+
+      NewDMask |= 1 << Component;
+      ++NewIndex;
+    }
+
+    Remapped.push_back(NewEE);
+  }
+
+  // Commit remapping
+  Call.setArgOperand(DMaskIndex, ConstantInt::get(Int32Ty, NewDMask));
+
+  for (const auto &OldIndexUse : Uses) {
+    if (Value *RemappedValue = Remapped[OldIndexUse.first]) {
+      OldIndexUse.second->replaceAllUsesWith(RemappedValue);
+    }
+  }
+
+  AnyChanges = true;
+}
Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=amdgcn -mcpu=verde -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
+
+; TODO:
+; - check soffset & (immediate) offset
+; - check glc/slc
+; - check vector versions
+; - v3f32 version: how?
+
+; CHECK-LABEL: {{^}}main:
+; CHECK: buffer_load_format_x {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 ; encoding
+; CHECK: buffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen ; encoding
+; CHECK: buffer_load_format_x {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; encoding
+; CHECK: buffer_load_format_x {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen ; encoding
+; CHECK: s_endpgm
+
+define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 {
+main_body:
+  %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
+  %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0
+  %rsrc = bitcast <16 x i8> %12 to <4 x i32>
+  %vgpr0 = add i32 %5, %7
+  %r0 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0, i1 0, i1 0, i1 0)
+  %r1 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 %5, i1 0, i1 0, i1 0)
+  %r2 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 %7, i32 0, i1 0, i1 0, i1 0)
+  %r3 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 %7, i32 %5, i1 0, i1 0, i1 0)
+  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r0, float %r1, float %r2, float %r3)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i32, i32, i1, i1, i1) #1
+declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i32, i32, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i32, i32, i1, i1, i1) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!"const", null, i32 1}