Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1459,29 +1459,50 @@
     }
   }
 
+  const AMDGPUTargetLowering *TLI
+    = static_cast<const AMDGPUTargetLowering *>(getTargetLowering());
+
   for (LoadSDNode *LD : LoadsToReplace) {
     SDLoc SL(LD);
 
     SDValue NewLoad = CurDAG->getLoad(MVT::v2i32, SL, LD->getChain(),
                                       LD->getBasePtr(), LD->getMemOperand());
-    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL,
-                                      MVT::i64, NewLoad);
-    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
+
+    SDValue ReplaceLoad;
+    SDValue ReplaceChain;
+    if (LD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->getMaxPrivateElementSize() < 8) {
+      // XXX - For some reason the merge_values fails to select when emitted
+      // here.
+      SDValue Scalarized = TLI->ScalarizeVectorLoad(NewLoad, *CurDAG);
+      assert(Scalarized.getOpcode() == ISD::MERGE_VALUES);
+      ReplaceLoad = Scalarized.getOperand(0);
+      ReplaceChain = Scalarized.getOperand(1);
+    } else {
+      ReplaceLoad = NewLoad.getValue(0);
+      ReplaceChain = NewLoad.getValue(1);
+    }
+
+    SDValue BitCast = CurDAG->getNode(ISD::BITCAST, SL, MVT::i64, ReplaceLoad);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 1), ReplaceChain);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LD, 0), BitCast);
     Modified = true;
   }
 
   for (StoreSDNode *ST : StoresToReplace) {
-    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SDLoc(ST),
-                                       MVT::v2i32, ST->getValue());
-    const SDValue StoreOps[] = {
-      ST->getChain(),
-      NewValue,
-      ST->getBasePtr(),
-      ST->getOffset()
-    };
+    SDLoc SL(ST);
+    SDValue NewValue = CurDAG->getNode(ISD::BITCAST, SL, MVT::v2i32,
+                                       ST->getValue());
+
+    SDValue NewStore = CurDAG->getStore(ST->getChain(), SL, NewValue,
+                                        ST->getBasePtr(), ST->getMemOperand());
+
+    if (ST->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
+        Subtarget->getMaxPrivateElementSize() < 8) {
+      NewStore = TLI->ScalarizeVectorStore(NewStore, *CurDAG);
+    }
 
-    CurDAG->UpdateNodeOperands(ST, StoreOps);
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(ST, 0), NewStore);
     Modified = true;
   }
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -89,6 +89,7 @@
   SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
   SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;
 
+public:
   /// \brief Split a vector load into a scalar load of each component.
   SDValue ScalarizeVectorLoad(SDValue Op, SelectionDAG &DAG) const;
 
@@ -101,6 +102,7 @@
   /// \brief Split a vector store into 2 stores of half the vector.
   SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
 
+protected:
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -80,7 +80,6 @@
   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
 
-  // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
 
@@ -1602,6 +1601,9 @@
   ISD::LoadExtType ExtType = Load->getExtensionType();
   EVT MemVT = Load->getMemoryVT();
 
+  if (MemVT == MVT::i32)
+    return SDValue();
+
   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
     assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
     // FIXME: Copied from PPC
@@ -1622,13 +1624,9 @@
     return DAG.getMergeValues(Ops, DL);
   }
 
-  if (!MemVT.isVector())
-    return SDValue();
-
   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
          "Custom lowering for non-i32 vectors hasn't been implemented.");
 
   unsigned NumElements = MemVT.getVectorNumElements();
-  assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
 
   switch (Load->getAddressSpace()) {
   case AMDGPUAS::CONSTANT_ADDRESS:
@@ -1866,6 +1864,9 @@
   StoreSDNode *Store = cast<StoreSDNode>(Op);
   EVT VT = Store->getMemoryVT();
 
+  if (VT == MVT::i32)
+    return SDValue();
+
   if (VT == MVT::i1) {
     return DAG.getTruncStore(Store->getChain(), DL,
                              DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
Index: test/CodeGen/AMDGPU/private-element-size.ll
===================================================================
--- test/CodeGen/AMDGPU/private-element-size.ll
+++ test/CodeGen/AMDGPU/private-element-size.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT16 -check-prefix=HSA -check-prefix=HSA-ELT16 -check-prefix=ALL -check-prefix=HSA-ELTGE8 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-8 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT8 -check-prefix=HSA -check-prefix=HSA-ELT8 -check-prefix=ALL -check-prefix=HSA-ELTGE8 %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=ELT4 -check-prefix=HSA -check-prefix=HSA-ELT4 -check-prefix=ALL %s
@@ -48,8 +48,7 @@
   %gep1 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 1
   store <4 x i32> zeroinitializer, <4 x i32>* %gep0
   store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32>* %gep1
-  %idxprom2 = sext i32 %index to i64
-  %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i64 0, i64 %idxprom2
+  %gep2 = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* %alloca, i32 0, i32 %index
   %load = load <4 x i32>, <4 x i32>* %gep2
   store <4 x i32> %load, <4 x i32> addrspace(1)* %out
   ret void
@@ -115,13 +114,135 @@
   %gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 1
   store <8 x i32> zeroinitializer, <8 x i32>* %gep0
   store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %gep1
-  %idxprom2 = sext i32 %index to i64
-  %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i64 0, i64 %idxprom2
+  %gep2 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>]* %alloca, i32 0, i32 %index
   %load = load <8 x i32>, <8 x i32>* %gep2
   store <8 x i32> %load, <8 x i32> addrspace(1)* %out
   ret void
 }
+
+; ALL-LABEL: {{^}}private_elt_size_i64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8
+
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:12{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idxprom = sext i32 %tid to i64
+  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %index = and i32 %index.load, 2
+  %alloca = alloca [2 x i64], align 16
+  %gep0 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 0
+  %gep1 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 1
+  store i64 0, i64* %gep0
+  store i64 34359738602, i64* %gep1
+  %gep2 = getelementptr inbounds [2 x i64], [2 x i64]* %alloca, i32 0, i32 %index
+  %load = load i64, i64* %gep2
+  store i64 %load, i64 addrspace(1)* %out
+  ret void
+}
+
+
+; ALL-LABEL: {{^}}private_elt_size_f64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8
+
+; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:12{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idxprom = sext i32 %tid to i64
+  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %index = and i32 %index.load, 2
+  %alloca = alloca [2 x double], align 16
+  %gep0 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 0
+  %gep1 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 1
+  store double 0.0, double* %gep0
+  store double 4.0, double* %gep1
+  %gep2 = getelementptr inbounds [2 x double], [2 x double]* %alloca, i32 0, i32 %index
+  %load = load double, double* %gep2
+  store double %load, double addrspace(1)* %out
+  ret void
+}
+
+; ALL-LABEL: {{^}}private_elt_size_v2i64:
+; HSA-ELT16: private_element_size = 3
+; HSA-ELT8: private_element_size = 2
+; HSA-ELT4: private_element_size = 1
+
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16
+; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen offset:24
+
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen
+; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s7 offen
+
+
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen offset:28{{$}}
+
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
+define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %idxprom = sext i32 %tid to i64
+  %gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
+  %index.load = load i32, i32 addrspace(1)* %gep.index
+  %index = and i32 %index.load, 2
+  %alloca = alloca [2 x <2 x i64>], align 16
+  %gep0 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 0
+  %gep1 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 1
+  store <2 x i64> zeroinitializer, <2 x i64>* %gep0
+  store <2 x i64> <i64 1, i64 2>, <2 x i64>* %gep1
+  %gep2 = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* %alloca, i32 0, i32 %index
+  %load = load <2 x i64>, <2 x i64>* %gep2
+  store <2 x i64> %load, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }