Index: llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -141,6 +141,18 @@
   }
 }
 
+static bool memAccessesCanBeReordered(
+  MachineBasicBlock::iterator A,
+  MachineBasicBlock::iterator B,
+  const SIInstrInfo *TII,
+  llvm::AliasAnalysis * AA) {
+  return (TII->areMemAccessesTriviallyDisjoint(*A, *B, AA) ||
+    // RAW or WAR - cannot reorder
+    // WAW - cannot reorder
+    // RAR - safe to reorder
+    !(A->mayStore() || B->mayStore()));
+}
+
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
 static bool
@@ -173,8 +185,8 @@
   for (MachineInstr *InstToMove : InstsToMove) {
     if (!InstToMove->mayLoadOrStore())
       continue;
-    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
-      return false;
+    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
+        return false;
   }
   return true;
 }
@@ -233,7 +245,7 @@
         return E;
 
       if (MBBI->mayLoadOrStore() &&
-          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+        !memAccessesCanBeReordered(*I, *MBBI, TII, AA)) {
         // We fail condition #1, but we may still be able to satisfy condition
         // #2.  Add this instruction to the move list and then we will check
         // if condition #2 holds once we have selected the matching instruction.
@@ -288,8 +300,10 @@
     // We could potentially keep looking, but we'd need to make sure that
     // it was safe to move I and also all the instruction in InstsToMove
     // down past this instruction.
-    // FIXME: This is too conservative.
-    break;
+    if (!memAccessesCanBeReordered(*I, *MBBI, TII, AA) ||   // check if we can move I across MBBI
+      !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA) // check if we can move all I's users
+     )
+      break;
   }
   return E;
 }
Index: llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll
@@ -493,6 +493,46 @@
   ret void
 }
 
+; SI-LABEL: ds_read_diff_base_interleaving
+; SI-NOT: ds_read_b32
+define amdgpu_kernel void @ds_read_diff_base_interleaving(
+  float addrspace(1)* nocapture %arg,
+  [4 x [4 x float]] addrspace(3)* %arg1,
+  [4 x [4 x float]] addrspace(3)* %arg2,
+  [4 x [4 x float]] addrspace(3)* %arg3,
+  [4 x [4 x float]] addrspace(3)* %arg4) #1 {
+bb:
+  %tmp = getelementptr float, float addrspace(1)* %arg, i64 10
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp7 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 0
+  %tmp8 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 0, i32 %tmp5
+  %tmp9 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 0
+  %tmp10 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 0, i32 %tmp5
+  %tmp11 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg1, i32 0, i32 %tmp6, i32 1
+  %tmp12 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg2, i32 0, i32 1, i32 %tmp5
+  %tmp13 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg3, i32 0, i32 %tmp6, i32 1
+  %tmp14 = getelementptr [4 x [4 x float]], [4 x [4 x float]] addrspace(3)* %arg4, i32 0, i32 1, i32 %tmp5
+  %tmp15 = load float, float addrspace(3)* %tmp7
+  %tmp16 = load float, float addrspace(3)* %tmp8
+  %tmp17 = fmul float %tmp15, %tmp16
+  %tmp18 = fadd float 2.000000e+00, %tmp17
+  %tmp19 = load float, float addrspace(3)* %tmp9
+  %tmp20 = load float, float addrspace(3)* %tmp10
+  %tmp21 = fmul float %tmp19, %tmp20
+  %tmp22 = fsub float %tmp18, %tmp21
+  %tmp23 = load float, float addrspace(3)* %tmp11
+  %tmp24 = load float, float addrspace(3)* %tmp12
+  %tmp25 = fmul float %tmp23, %tmp24
+  %tmp26 = fsub float %tmp22, %tmp25
+  %tmp27 = load float, float addrspace(3)* %tmp13
+  %tmp28 = load float, float addrspace(3)* %tmp14
+  %tmp29 = fmul float %tmp27, %tmp28
+  %tmp30 = fsub float %tmp26, %tmp29
+  store float %tmp30, float addrspace(1)* %tmp
+  ret void
+}
+
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.amdgcn.workgroup.id.x() #1