diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -713,39 +713,6 @@
 defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
 defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
 
-// Prefer ds_read over ds_read2, all other things being equal, because it has
-// a larger immediate offset range.
-let AddedComplexity = 100 in {
-
-foreach vt = VReg_64.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
-}
-
-let SubtargetPredicate = isGFX7Plus in {
-
-foreach vt = VReg_96.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
-}
-
-// For performance reasons restrict this to alignment >= 16 even with
-// unaligned-access-mode. At lower alignments ds_read2_b64 is always a better
-// choice.
-foreach vt = VReg_128.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
-}
-
-let SubtargetPredicate = HasUnalignedAccessMode in {
-
-foreach vt = VReg_96.RegTypes in {
-defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
-}
-
-} // End SubtargetPredicate = HasUnalignedAccessMode
-
-} // End SubtargetPredicate = isGFX7Plus
-
-} // End AddedComplexity = 100
-
 let OtherPredicates = [D16PreservesUnusedBits] in {
 def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2i16>;
 def : DSReadPat_D16<DS_READ_U16_D16_HI, load_d16_hi_local, v2f16>;
@@ -869,33 +836,51 @@
 defm : DS128Bit8ByteAlignedPat_mc<vt>;
 }
 
-// Prefer ds_write over ds_write2, all other things being equal, because it has
-// a larger immediate offset range.
+// Prefer ds_read over ds_read2 and ds_write over ds_write2, all other things
+// being equal, because it has a larger immediate offset range.
 let AddedComplexity = 100 in {
 
+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
+}
+
 foreach vt = VReg_64.RegTypes in {
 defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
 }
 
 let SubtargetPredicate = isGFX7Plus in {
 
+foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_align16_local">;
+}
+
 foreach vt = VReg_96.RegTypes in {
 defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_align16_local">;
 }
 
-// For performance reasons restrict this to alignment >= 16 even with
-// unaligned-access-mode. At lower alignments ds_write2_b64 is always a better
-// choice.
+foreach vt = VReg_128.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B128, vt, "load_align16_local">;
+}
+
 foreach vt = VReg_128.RegTypes in {
 defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align16_local">;
 }
 
 let SubtargetPredicate = HasUnalignedAccessMode in {
 
+// FIXME: Is ds_read_b96/ds_write_b96 better choice in unaligned-access-mode?
+foreach vt = VReg_96.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B96, vt, "load_local">;
+}
+
 foreach vt = VReg_96.RegTypes in {
 defm : DSWritePat_mc <DS_WRITE_B96, vt, "store_local">;
 }
 
+// For performance reasons, *do not* select ds_read_b128/ds_write_b128 in
+// unaligned-access-mode. At lower alignments ds_read2_b64/ds_write2_b64 is
+// always a better choice.
+
 } // End SubtargetPredicate = HasUnalignedAccessMode
 
 } // End SubtargetPredicate = isGFX7Plus
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1412,10 +1412,15 @@
       return true;
     }
 
+    // Either, the alignment requirements are "enabled", or there is an
+    // unaligned LDS access related hardware bug though alignment requirements
+    // are "disabled". In either case, we need to check for proper alignment
+    // requirements.
+    //
     if (Size == 64) {
-      // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
-      // aligned, 8 byte access in a single operation using ds_read2/write2_b32
-      // with adjacent offsets.
+      // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
+      // can do a 4 byte aligned, 8 byte access in a single operation using
+      // ds_read2/write2_b32 with adjacent offsets.
       bool AlignedBy4 = Alignment >= Align(4);
       if (IsFast)
         *IsFast = AlignedBy4;
@@ -1423,22 +1428,22 @@
       return AlignedBy4;
     }
     if (Size == 96) {
-      // ds_read/write_b96 require 16-byte alignment on gfx8 and older.
-      bool Aligned = Alignment >= Align(16);
+      // 12 byte accessing via ds_read/write_b96 require 16-byte alignment.
+      bool AlignedBy16 = Alignment >= Align(16);
       if (IsFast)
-        *IsFast = Aligned;
+        *IsFast = AlignedBy16;
 
-      return Aligned;
+      return AlignedBy16;
     }
     if (Size == 128) {
-      // ds_read/write_b128 require 16-byte alignment on gfx8 and older, but we
-      // can do a 8 byte aligned, 16 byte access in a single operation using
+      // 16 byte accessing via ds_read/write_b128 require 16-byte alignment, but
+      // we can do a 8 byte aligned, 16 byte access in a single operation using
       // ds_read2/write2_b64.
-      bool Aligned = Alignment >= Align(8);
+      bool AlignedBy8 = Alignment >= Align(8);
       if (IsFast)
-        *IsFast = Aligned;
+        *IsFast = AlignedBy8;
 
-      return Aligned;
+      return AlignedBy8;
     }
   }