Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -584,61 +584,54 @@
                                                       unsigned AddrSpace,
                                                       unsigned Align,
                                                       bool *IsFast) const {
+  // If the access is allowed, it is expected to be similarly fast as an aligned
+  // access in all cases except one.
   if (IsFast)
-    *IsFast = false;
+    *IsFast = true;

   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
   // which isn't a simple VT.
   // Until MVT is extended to handle this, simply check for the size and
   // rely on the condition below: allow accesses if the size is a multiple of 4.
-  if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
-                           VT.getStoreSize() > 16)) {
+  if (VT == MVT::Other)
     return false;
-  }
+
+  // 4-byte alignment is always good in all address spaces.
+  //
+  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
+  // byte-address are ignored, thus forcing Dword alignment.
+  // This applies to private, global, and constant memory.
+  bool AlignedBy4 = (Align % 4 == 0);
+  if (AlignedBy4)
+    return true;

   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
-    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
-    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
-    // with adjacent offsets.
-    bool AlignedBy4 = (Align % 4 == 0);
-    if (IsFast)
-      *IsFast = AlignedBy4;
-
-    return AlignedBy4;
+    // For local/region, ds_read/write_b64 require 8-byte alignment, but we can
+    // do a 4 byte aligned, 8 byte access in a single operation using
+    // ds_read2/write2_b32 with adjacent offsets.
+    return false;
   }

-  // FIXME: We have to be conservative here and assume that flat operations
-  // will access scratch. If we had access to the IR function, then we
-  // could determine if any private memory was used in the function.
   if (!Subtarget->hasUnalignedScratchAccess() &&
       (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
        AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+    // FIXME: We have to be conservative here and assume that flat operations
+    // will access scratch. If we had access to the IR function, then we could
+    // determine if any private memory was used in the function.
     return false;
   }

   if (Subtarget->hasUnalignedBufferAccess()) {
     // If we have an uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
-    if (IsFast) {
-      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
-        (Align % 4 == 0) : true;
-    }
-
+    if (IsFast)
+      *IsFast = (AddrSpace != AMDGPUAS::CONSTANT_ADDRESS);
     return true;
   }

   // Smaller than dword value must be aligned.
-  if (VT.bitsLT(MVT::i32))
-    return false;
-
-  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
-  // byte-address are ignored, thus forcing Dword alignment.
-  // This applies to private, global, and constant memory.
-  if (IsFast)
-    *IsFast = true;
-
-  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+  return false;
 }

 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
Index: test/CodeGen/AMDGPU/unaligned-load-store.ll
===================================================================
--- test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=UNALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access,+unaligned-scratch-access -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=UNALIGNED %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s

 ; SI-LABEL: {{^}}local_unaligned_load_store_i16:
@@ -601,4 +601,81 @@
   ret void
 }

+; SI-LABEL: {{^}}private_store_align4_v16i64:
+; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; SI-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
+
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:4
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:8
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:12
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:16
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:20
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:24
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:28
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:32
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:36
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:40
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:44
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:48
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:52
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:56
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:60
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:64
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:68
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:72
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:76
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:80
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:84
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:88
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:92
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:96
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:100
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:104
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:108
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:112
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:116
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:120
+; SI-DAG: buffer_store_dword [[ZERO]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:124
+define void @private_store_align4_v16i64(<16 x i64>* %out) #0 {
+  store <16 x i64> zeroinitializer, <16 x i64>* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}private_store_align1_v2i16:
+; UNALIGNED: buffer_store_dword
+
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+; ALIGNED: buffer_store_byte
+define void @private_store_align1_v2i16(<2 x i16>* %out) #0 {
+  store <2 x i16> zeroinitializer, <2 x i16>* %out, align 1
+  ret void
+}
+
+; SI-LABEL: {{^}}private_store_align2_v2i16:
+; UNALIGNED: buffer_store_dword
+
+; ALIGNED: buffer_store_short
+; ALIGNED: buffer_store_short
+define void @private_store_align2_v2i16(<2 x i16>* %out) #0 {
+  store <2 x i16> zeroinitializer, <2 x i16>* %out, align 2
+  ret void
+}
+
+; SI-LABEL: {{^}}private_store_align4_v2i16:
+; SI: buffer_store_dword
+define void @private_store_align4_v2i16(<2 x i16>* %out) #0 {
+  store <2 x i16> zeroinitializer, <2 x i16>* %out, align 4
+  ret void
+}
+
+; SI-LABEL: {{^}}private_store_align8_v2i16:
+; SI: buffer_store_dword
+define void @private_store_align8_v2i16(<2 x i16>* %out) #0 {
+  store <2 x i16> zeroinitializer, <2 x i16>* %out, align 8
+  ret void
+}
+
 attributes #0 = { nounwind }
Index: test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
===================================================================
--- test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -7,15 +7,7 @@
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-; ALIGNED: store i32
-
-; ELT8-UNALIGNED: store <2 x i32>
-; ELT8-UNALIGNED: store <2 x i32>
-
-; ELT16-UNALIGNED: store <4 x i32>
+; ALL-UNALIGNED: store <4 x i32>
 define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2