Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -73,7 +73,7 @@
 
   // Set defaults if needed.
   if (MaxPrivateElementSize == 0)
-    MaxPrivateElementSize = 16;
+    MaxPrivateElementSize = 4;
 
   return *this;
 }
Index: test/CodeGen/AMDGPU/indirect-private-64.ll
===================================================================
--- test/CodeGen/AMDGPU/indirect-private-64.ll
+++ test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -1,14 +1,20 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
 
 declare void @llvm.amdgcn.s.barrier() #1
 
 ; SI-LABEL: {{^}}private_access_f64_alloca:
 
-; SI-ALLOCA: buffer_store_dwordx2
-; SI-ALLOCA: buffer_load_dwordx2
+; SI-ALLOCA16: buffer_store_dwordx2
+; SI-ALLOCA16: buffer_load_dwordx2
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
 
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
@@ -25,8 +31,17 @@
 
 ; SI-LABEL: {{^}}private_access_v2f64_alloca:
 
-; SI-ALLOCA: buffer_store_dwordx4
-; SI-ALLOCA: buffer_load_dwordx4
+; SI-ALLOCA16: buffer_store_dwordx4
+; SI-ALLOCA16: buffer_load_dwordx4
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
 
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_write_b64
@@ -45,8 +60,14 @@
 
 ; SI-LABEL: {{^}}private_access_i64_alloca:
 
-; SI-ALLOCA: buffer_store_dwordx2
-; SI-ALLOCA: buffer_load_dwordx2
+; SI-ALLOCA16: buffer_store_dwordx2
+; SI-ALLOCA16: buffer_load_dwordx2
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+
 
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
@@ -63,8 +84,18 @@
 
 ; SI-LABEL: {{^}}private_access_v2i64_alloca:
 
-; SI-ALLOCA: buffer_store_dwordx4
-; SI-ALLOCA: buffer_load_dwordx4
+; SI-ALLOCA16: buffer_store_dwordx4
+; SI-ALLOCA16: buffer_load_dwordx4
+
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+; SI-ALLOCA4: buffer_store_dword v
+
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
+; SI-ALLOCA4: buffer_load_dword v
 
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_write_b64
Index: test/CodeGen/AMDGPU/insert_vector_elt.ll
===================================================================
--- test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=SI %s
 
 ; FIXME: Broken on evergreen
 ; FIXME: For some reason the 8 and 16 vectors are being stored as
Index: test/CodeGen/AMDGPU/large-alloca-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -7,11 +7,11 @@
 
 ; ALL-LABEL: {{^}}large_alloca_compute_shader:
 
-; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x98f000
-; VI: s_mov_b32 s11, 0x980000
+; GCN: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s{{[0-9]+}}, -1
+; CI-NEXT: s_mov_b32 s{{[0-9]+}}, 0x88f000
+; VI-NEXT: s_mov_b32 s{{[0-9]+}}, 0x880000
 
 
 ; GCNHSA: .amd_kernel_code_t
Index: test/CodeGen/AMDGPU/large-alloca-graphics.ll
===================================================================
--- test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -3,10 +3,10 @@
 
 ; ALL-LABEL: {{^}}large_alloca_pixel_shader:
 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x98f000
-; VI: s_mov_b32 s11, 0x980000
+; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s10, -1
+; CI-NEXT: s_mov_b32 s11, 0x88f000
+; VI-NEXT: s_mov_b32 s11, 0x880000
 
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
@@ -24,10 +24,10 @@
 
 ; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x98f000
-; VI: s_mov_b32 s11, 0x980000
+; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s10, -1
+; CI-NEXT: s_mov_b32 s11, 0x88f000
+; VI-NEXT: s_mov_b32 s11, 0x880000
 
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -23,14 +23,21 @@
 ; GCNMESA: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCNMESA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCNMESA-NEXT: s_mov_b32 s14, -1
-; SIMESA-NEXT: s_mov_b32 s15, 0x98f000
-; VIMESA-NEXT: s_mov_b32 s15, 0x980000
+; SIMESA-NEXT: s_mov_b32 s15, 0x88f000
+; VIMESA-NEXT: s_mov_b32 s15, 0x880000
 
 
 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
 
-; GCN: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
-; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_store_dword {{v[0-9]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
 
 ; GCN: NumVgprs: 256
 ; GCN: ScratchSize: 1024
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -15,8 +15,8 @@
 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x98f000
-; VI-NEXT: s_mov_b32 s15, 0x980000
+; SI-NEXT: s_mov_b32 s15, 0x88f000
+; VI-NEXT: s_mov_b32 s15, 0x880000
 
 ; s12 is offset user SGPR
 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill