Index: lib/Target/AMDGPU/AMDGPUInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -193,6 +193,8 @@
   /// equivalent opcode that writes \p Channels Channels.
   int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
 
+  /// Return the equivalent opcode that loads \p Size dwords, or -1 if none.
+  int getResizedLoadOp(uint16_t Opcode, unsigned Size) const;
 };
 
 namespace AMDGPU {
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -323,6 +323,19 @@
   }
 }
 
+int AMDGPUInstrInfo::getResizedLoadOp(uint16_t Opcode, unsigned Size) const {
+  // Translate the plain dword count into the enum value expected by the
+  // TableGen-generated mapping.
+  AMDGPU::Size InSize;
+  switch (Size) {
+  case 1: InSize = AMDGPU::Size_1; break;
+  case 2: InSize = AMDGPU::Size_2; break;
+  case 3: InSize = AMDGPU::Size_3; break;
+  case 4: InSize = AMDGPU::Size_4; break;
+  default: return -1;
+  }
+  return AMDGPU::getResizedLoadOp(Opcode, InSize);
+}
+
 // Wrapper for Tablegen'd function. enum Subtarget is not defined in any
 // header files, so we need to wrap it in a function that takes unsigned
 // instead.
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -43,6 +43,9 @@
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+  void adjustLoadSize(MachineSDNode *&N, SelectionDAG &DAG) const;
+
+  LaneBitmask findUsedLanes(SDNode *N) const;
 
   SDValue performUCharToFloatCombine(SDNode *N,
                                      DAGCombinerInfo &DCI) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -33,6 +33,8 @@
 #include "llvm/IR/Function.h"
 #include "llvm/ADT/SmallString.h"
 
+#define DEBUG_TYPE "si-isel-lowering"
+
 using namespace llvm;
 
 SITargetLowering::SITargetLowering(TargetMachine &TM,
@@ -2090,6 +2092,89 @@
   }
 }
 
+/// Returns a bitmask of the lanes of the vector result of \p N that are
+/// actually used. An early out is taken as soon as the highest lane is known
+/// to be used; ~0 is returned in that case.
+LaneBitmask SITargetLowering::findUsedLanes(SDNode *N) const {
+  assert(N->getNumValues() == 1);
+  const unsigned NumElements = N->getValueType(0).getVectorNumElements();
+  const LaneBitmask HighestMask = (LaneBitmask)1 << (NumElements - 1);
+  LaneBitmask Mask = 0;
+
+  for (SDNode *Use : N->uses()) {
+    if (!Use->isMachineOpcode()) {
+      DEBUG(dbgs() << "findUsedLanes: non-machine opcode\n");
+      return ~(LaneBitmask)0;
+    }
+
+    if (Use->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) {
+      DEBUG(dbgs() << "findUsedLanes: unsupported opcode "
+                   << Use->getMachineOpcode() << '\n');
+      return ~(LaneBitmask)0;
+    }
+
+    // Operand 1 of EXTRACT_SUBREG is the subregister index; accumulate the
+    // lanes it covers.
+    Mask |= Subtarget->getRegisterInfo()->getSubRegIndexLaneMask(
+        Use->getConstantOperandVal(1));
+    if (Mask & HighestMask)
+      return ~(LaneBitmask)0;
+  }
+
+  return Mask;
+}
+
+/// Reduce the size of a load instruction if only a prefix of the returned
+/// channels is used. Currently only used for BUFFER_LOAD_FORMAT_XYZW.
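+///
+/// For example, if every user of a BUFFER_LOAD_FORMAT_XYZW node extracts only
+/// the first two channels, the node is rewritten to a BUFFER_LOAD_FORMAT_XY
+/// that loads just those two dwords.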
+void SITargetLowering::adjustLoadSize(MachineSDNode *&N,
+                                      SelectionDAG &DAG) const {
+  EVT OriginalType = N->getValueType(0);
+
+  if (!OriginalType.isVector())
+    return;
+
+  const LaneBitmask Mask = findUsedLanes(N);
+  if (Mask == ~(LaneBitmask)0)
+    return;
+  if (!Mask) {
+    DEBUG(dbgs() << "adjustLoadSize: dead load has not been eliminated\n");
+    return;
+  }
+
+  // Only a trailing run of channels can be dropped, so the required size is
+  // determined by the highest used lane, not the number of used lanes.
+  const unsigned RequiredSize = findLastSet(Mask) + 1;
+
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+  const unsigned NewOpcode =
+      TII->getResizedLoadOp(N->getMachineOpcode(), RequiredSize);
+
+  // Make a temporary copy of the operands to avoid problems with in-place
+  // mutation.
+  std::vector<SDValue> Ops;
+  Ops.insert(Ops.end(), N->op_begin(), N->op_end());
+
+  if (RequiredSize > 1) {
+    // We do not adjust the type of the node here, because MachineValueTypes
+    // do not support v3f32 properly. The correct machine register class will
+    // eventually be selected based on the opcode once the MachineInstr is
+    // built.
+    N = static_cast<MachineSDNode *>(
+        DAG.SelectNodeTo(N, NewOpcode, OriginalType, Ops));
+  } else {
+    // Bypass the EXTRACT_SUBREG instructions here, because trying to deal
+    // with v1xx types is a headache.
+    const EVT NewType = OriginalType.getVectorElementType();
+    N = static_cast<MachineSDNode *>(
+        DAG.SelectNodeTo(N, NewOpcode, NewType, Ops));
+
+    for (SDNode *Use : N->uses()) {
+      assert(Use->getMachineOpcode() == TargetOpcode::EXTRACT_SUBREG);
+      DAG.ReplaceAllUsesWith(Use, N);
+    }
+  }
+}
+
 static bool isFrameIndexOp(SDValue Op) {
   if (Op.getOpcode() == ISD::AssertZext)
     Op = Op.getOperand(0);
@@ -2125,11 +2210,15 @@
   const SIInstrInfo *TII =
       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
 
-  if (TII->isMIMG(Node->getMachineOpcode()))
-    adjustWritemask(Node, DAG);
+  const unsigned Opcode = Node->getMachineOpcode();
 
-  if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
-      Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
+  if (TII->isMIMG(Opcode)) {
+    adjustWritemask(Node, DAG);
+  } else if (TII->isMUBUF(Opcode)) {
+    if (TII->getResizedLoadOp(Opcode, 1) >= 0)
+      adjustLoadSize(Node, DAG);
+  } else if (Opcode == AMDGPU::INSERT_SUBREG ||
+             Opcode == AMDGPU::REG_SEQUENCE) {
     legalizeTargetIndependentNode(Node, DAG);
     return Node;
   }
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -89,6 +89,11 @@
   int VI = 1;
 }
 
+class ResizableLoad <string group, int size> {
+  string Group = group;
+  int Size = size;
+}
+
 //===----------------------------------------------------------------------===//
 // SI DAG Nodes
 //===----------------------------------------------------------------------===//
@@ -2778,6 +2783,14 @@
   let ValueCols = [["1"], ["2"], ["3"]];
 }
 
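+// Maps a resizable load opcode to the opcode in the same Group, and with the
+// same addressing mode, that loads the given number of dwords. TableGen's
+// InstrMapping backend emits this as AMDGPU::getResizedLoadOp() together
+// with the AMDGPU::Size enum used by the wrapper in AMDGPUInstrInfo.cpp.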
"buffer_load_format_xy", VReg_64 ->; +>, ResizableLoad <"buffer_load_format", 2>; defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Load_Helper < mubuf<0x02>, "buffer_load_format_xyz", VReg_96 ->; +>, ResizableLoad <"buffer_load_format", 3>; defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper < mubuf<0x03>, "buffer_load_format_xyzw", VReg_128 ->; +>, ResizableLoad <"buffer_load_format", 4>; defm BUFFER_STORE_FORMAT_X : MUBUF_Store_Helper < mubuf<0x04>, "buffer_store_format_x", VGPR_32 >;