Index: lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -240,8 +240,20 @@
   if (!Cluster)
     return;
 
-  // Sort them in increasing order.
-  llvm::sort(Offsets);
+  // Sort them in increasing order, but don't break value dependencies between
+  // the loads.
+  llvm::sort(
+      Offsets.begin(),
+      Offsets.end(),
+      [&O2SMap](int64_t OffsetA, int64_t OffsetB) {
+        const SDNode *A = O2SMap[OffsetA];
+        const SDNode *B = O2SMap[OffsetB];
+        if (A->isOperandOf(B))
+          return true;
+        if (B->isOperandOf(A))
+          return false;
+        return OffsetA < OffsetB;
+      });
 
   // Check if the loads are close enough.
   SmallVector<SDNode*, 4> Loads;
@@ -252,7 +264,14 @@
   for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
     int64_t Offset = Offsets[i];
     SDNode *Load = O2SMap[Offset];
-    if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset,NumLoads))
+
+    // These may not be perfectly sorted by offset if there are value
+    // dependencies between some of the loads. Sort them when asking the target.
+    int64_t LowOffset = std::min(BaseOff, Offset);
+    int64_t HighOffset = std::max(BaseOff, Offset);
+
+    if (!TII->shouldScheduleLoadsNear(BaseLoad, Load,
+                                      LowOffset, HighOffset, NumLoads))
       break; // Stop right here. Ignore loads that are further away.
     Loads.push_back(Load);
     ++NumLoads;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9367,51 +9367,6 @@
     Ops.push_back(ImpDef.getValue(1));
     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
   }
-  case AMDGPU::FLAT_LOAD_UBYTE_D16_HI:
-  case AMDGPU::FLAT_LOAD_SBYTE_D16_HI:
-  case AMDGPU::FLAT_LOAD_SHORT_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_UBYTE_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_SBYTE_D16_HI:
-  case AMDGPU::GLOBAL_LOAD_SHORT_D16_HI:
-  case AMDGPU::DS_READ_U16_D16_HI:
-  case AMDGPU::DS_READ_I8_D16_HI:
-  case AMDGPU::DS_READ_U8_D16_HI:
-  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
-  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
-  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
-  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN: {
-    // For these loads that write to the HI part of a register,
-    // we should chain them to the op that writes to the LO part
-    // of the register to maintain the order.
-    unsigned NumOps = Node->getNumOperands();
-    SDValue OldChain = Node->getOperand(NumOps-1);
-
-    if (OldChain.getValueType() != MVT::Other)
-      break;
-
-    // Look for the chain to replace to.
-    SDValue Lo = Node->getOperand(NumOps-2);
-    SDNode *LoNode = Lo.getNode();
-    if (LoNode->getNumValues() == 1 ||
-        LoNode->getValueType(LoNode->getNumValues() - 1) != MVT::Other)
-      break;
-
-    SDValue NewChain = Lo.getValue(LoNode->getNumValues() - 1);
-    if (NewChain == OldChain) // Already replaced.
-      break;
-
-    SmallVector<SDValue, 16> Ops;
-    for (unsigned I = 0; I < NumOps-1; ++I)
-      Ops.push_back(Node->getOperand(I));
-    // Repalce the Chain.
-    Ops.push_back(NewChain);
-    MachineSDNode *NewNode = DAG.getMachineNode(Opcode, SDLoc(Node),
-                                                Node->getVTList(), Ops);
-    DAG.setNodeMemRefs(NewNode, Node->memoperands());
-    return NewNode;
-  }
   default:
     break;
   }
Index: test/CodeGen/AMDGPU/chain-hi-to-lo.ll
===================================================================
--- test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}chain_hi_to_lo_private:
 ; GCN: buffer_load_ushort [[DST:v[0-9]+]], off, [[RSRC:s\[[0-9]+:[0-9]+\]]], [[SOFF:s[0-9]+]] offset:2
@@ -139,3 +139,39 @@
 
   ret <2 x half> %result
 }
+
+; Make sure we don't lose any of the private stores.
+; GCN-LABEL: {{^}}vload2_private:
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
+; GCN: buffer_store_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
+; GCN: buffer_store_short v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
+
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:4
+; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:6
+; GCN: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s{{[0-9]+}} offset:8
+define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 {
+entry:
+  %loc = alloca [3 x i16], align 2, addrspace(5)
+  %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+  %tmp = load i16, i16 addrspace(1)* %in, align 2
+  %loc.0.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 0
+  store volatile i16 %tmp, i16 addrspace(5)* %loc.0.sroa_idx
+  %arrayidx.1 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
+  %tmp1 = load i16, i16 addrspace(1)* %arrayidx.1, align 2
+  %loc.2.sroa_idx3 = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
+  store volatile i16 %tmp1, i16 addrspace(5)* %loc.2.sroa_idx3
+  %arrayidx.2 = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 2
+  %tmp2 = load i16, i16 addrspace(1)* %arrayidx.2, align 2
+  %loc.4.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 2
+  store volatile i16 %tmp2, i16 addrspace(5)* %loc.4.sroa_idx
+  %loc.0.sroa_cast = bitcast [3 x i16] addrspace(5)* %loc to <2 x i16> addrspace(5)*
+  %loc.0. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.0.sroa_cast, align 2
+  store <2 x i16> %loc.0., <2 x i16> addrspace(1)* %out, align 4
+  %loc.2.sroa_idx = getelementptr inbounds [3 x i16], [3 x i16] addrspace(5)* %loc, i32 0, i32 1
+  %loc.2.sroa_cast = bitcast i16 addrspace(5)* %loc.2.sroa_idx to <2 x i16> addrspace(5)*
+  %loc.2. = load <2 x i16>, <2 x i16> addrspace(5)* %loc.2.sroa_cast, align 2
+  %arrayidx6 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 1
+  store <2 x i16> %loc.2., <2 x i16> addrspace(1)* %arrayidx6, align 4
+  %loc.0.sroa_cast2 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)*
+  ret void
+}